Importing relevant libraries
library(data.table)
library(mltools)
library(DMwR)
library(plyr)
library(dplyr)
library(caTools)
library(caret)
library(e1071)
library(corrplot)
library("arules")
library(nnet)
library(randomForest)
library(Boruta)
#install.packages("scales")
library(plotly)
#install.packages('arulesViz')
set.seed(1000)
Importing dataset for initial preparation
# Read the raw ClaMP export and coerce every column to numeric in one pass,
# storing the result as a plain data.frame.
clamp <- data.frame(lapply(fread("ClaMP_Raw-5184.csv"), as.numeric))
str(clamp)
'data.frame': 5184 obs. of 56 variables:
$ e_magic : num 23117 23117 23117 23117 23117 ...
$ e_cblp : num 144 144 144 144 144 80 144 144 144 144 ...
$ e_cp : num 3 3 3 3 3 2 3 3 3 3 ...
$ e_crlc : num 0 0 0 0 0 0 0 0 0 0 ...
$ e_cparhdr : num 4 4 4 4 4 4 4 4 4 4 ...
$ e_minalloc : num 0 0 0 0 0 15 0 0 0 0 ...
$ e_maxalloc : num 65535 65535 65535 65535 65535 ...
$ e_ss : num 0 0 0 0 0 0 0 0 0 0 ...
$ e_sp : num 184 184 184 184 184 184 184 184 184 184 ...
$ e_csum : num 0 0 0 0 0 0 0 0 0 0 ...
$ e_ip : num 0 0 0 0 0 0 0 0 0 0 ...
$ e_cs : num 0 0 0 0 0 0 0 0 0 0 ...
$ e_lfarlc : num 64 64 64 64 64 64 64 64 64 64 ...
$ e_ovno : num 0 0 0 0 0 26 0 0 0 0 ...
$ e_res : num NA NA NA NA NA NA NA NA NA NA ...
$ e_oemid : num 0 0 0 0 0 0 0 0 0 0 ...
$ e_oeminfo : num 0 0 0 0 0 0 0 0 0 0 ...
$ e_res2 : num NA NA NA NA NA NA NA NA NA NA ...
$ e_lfanew : num 256 184 272 184 224 256 272 256 240 224 ...
$ Machine : num 332 332 332 332 332 332 332 332 332 332 ...
$ NumberOfSections : num 4 4 5 1 5 8 8 5 5 6 ...
$ CreationYear : num 2006 1999 2012 2011 2012 ...
$ PointerToSymbolTable : num 0 0 0 0 0 0 0 0 0 0 ...
$ NumberOfSymbols : num 0 0 0 0 0 0 0 0 0 0 ...
$ SizeOfOptionalHeader : num 224 224 224 224 224 224 224 224 224 224 ...
$ Characteristics : num 8450 8462 8450 8450 258 ...
$ Magic : num 267 267 267 267 267 267 267 267 267 267 ...
$ MajorLinkerVersion : num 8 5 9 9 10 2 6 8 10 9 ...
$ MinorLinkerVersion : num 0 10 0 0 10 25 0 0 10 0 ...
$ SizeOfCode : num 1100288 4096 27648 0 11776 ...
$ SizeOfInitializedData : num 225792 2560 20480 87552 36352 ...
$ SizeOfUninitializedData : num 0 0 0 0 0 0 0 0 0 0 ...
$ AddressOfEntryPoint : num 1069880 7680 28859 0 13379 ...
$ BaseOfCode : num 4096 4096 4096 4096 4096 ...
$ BaseOfData : num 1110016 8192 32768 4096 16384 ...
$ ImageBase : num 1.18e+09 2.68e+08 2.68e+08 2.68e+08 4.19e+06 ...
$ SectionAlignment : num 4096 4096 4096 4096 4096 ...
$ FileAlignment : num 512 512 512 512 512 ...
$ MajorOperatingSystemVersion: num 4 4 5 6 6 1 4 4 6 5 ...
$ MinorOperatingSystemVersion: num 0 0 0 1 2 0 0 0 2 0 ...
$ MajorImageVersion : num 0 0 0 6 6 0 0 0 6 0 ...
$ MinorImageVersion : num 0 0 0 1 2 0 0 0 2 0 ...
$ MajorSubsystemVersion : num 5 4 5 5 6 4 4 4 6 5 ...
$ MinorSubsystemVersion : num 1 0 0 0 2 0 0 0 2 0 ...
$ SizeOfImage : num 1335296 20480 61440 94208 57344 ...
$ SizeOfHeaders : num 1024 1024 1024 512 1024 ...
$ CheckSum : num 1194954 0 67688 113668 69089 ...
$ Subsystem : num 3 2 2 2 2 2 2 2 3 3 ...
$ DllCharacteristics : num 64 0 320 1344 33088 ...
$ SizeOfStackReserve : num 1048576 1048576 1048576 1048576 262144 ...
$ SizeOfStackCommit : num 4096 4096 4096 4096 8192 ...
$ SizeOfHeapReserve : num 1048576 1048576 1048576 1048576 1048576 ...
$ SizeOfHeapCommit : num 4096 4096 4096 4096 4096 ...
$ LoaderFlags : num 0 0 0 0 0 0 0 0 0 0 ...
$ NumberOfRvaAndSizes : num 16 16 16 16 16 16 16 16 16 16 ...
$ class : num 0 0 0 0 0 0 0 0 0 0 ...
Remove the empty columns (e_res, e_res2) and drop e_magic and e_crlc
# Drop the columns that are excluded from the analysis.
clamp$e_res <- NULL
clamp$e_res2 <- NULL
clamp$e_magic <- NULL
clamp$e_crlc <- NULL
# Flag rows holding at least one missing value. complete.cases() inspects the
# data frame directly, whereas apply(clamp, 1, ...) would first coerce the
# whole frame to a matrix just to scan it row by row.
row.has.na <- !complete.cases(clamp)
# Keep the incomplete rows aside for inspection.
row.with.na <- clamp[row.has.na, ]
str(clamp)
'data.frame': 5184 obs. of 52 variables:
$ e_cblp : num 144 144 144 144 144 80 144 144 144 144 ...
$ e_cp : num 3 3 3 3 3 2 3 3 3 3 ...
$ e_cparhdr : num 4 4 4 4 4 4 4 4 4 4 ...
$ e_minalloc : num 0 0 0 0 0 15 0 0 0 0 ...
$ e_maxalloc : num 65535 65535 65535 65535 65535 ...
$ e_ss : num 0 0 0 0 0 0 0 0 0 0 ...
$ e_sp : num 184 184 184 184 184 184 184 184 184 184 ...
$ e_csum : num 0 0 0 0 0 0 0 0 0 0 ...
$ e_ip : num 0 0 0 0 0 0 0 0 0 0 ...
$ e_cs : num 0 0 0 0 0 0 0 0 0 0 ...
$ e_lfarlc : num 64 64 64 64 64 64 64 64 64 64 ...
$ e_ovno : num 0 0 0 0 0 26 0 0 0 0 ...
$ e_oemid : num 0 0 0 0 0 0 0 0 0 0 ...
$ e_oeminfo : num 0 0 0 0 0 0 0 0 0 0 ...
$ e_lfanew : num 256 184 272 184 224 256 272 256 240 224 ...
$ Machine : num 332 332 332 332 332 332 332 332 332 332 ...
$ NumberOfSections : num 4 4 5 1 5 8 8 5 5 6 ...
$ CreationYear : num 2006 1999 2012 2011 2012 ...
$ PointerToSymbolTable : num 0 0 0 0 0 0 0 0 0 0 ...
$ NumberOfSymbols : num 0 0 0 0 0 0 0 0 0 0 ...
$ SizeOfOptionalHeader : num 224 224 224 224 224 224 224 224 224 224 ...
$ Characteristics : num 8450 8462 8450 8450 258 ...
$ Magic : num 267 267 267 267 267 267 267 267 267 267 ...
$ MajorLinkerVersion : num 8 5 9 9 10 2 6 8 10 9 ...
$ MinorLinkerVersion : num 0 10 0 0 10 25 0 0 10 0 ...
$ SizeOfCode : num 1100288 4096 27648 0 11776 ...
$ SizeOfInitializedData : num 225792 2560 20480 87552 36352 ...
$ SizeOfUninitializedData : num 0 0 0 0 0 0 0 0 0 0 ...
$ AddressOfEntryPoint : num 1069880 7680 28859 0 13379 ...
$ BaseOfCode : num 4096 4096 4096 4096 4096 ...
$ BaseOfData : num 1110016 8192 32768 4096 16384 ...
$ ImageBase : num 1.18e+09 2.68e+08 2.68e+08 2.68e+08 4.19e+06 ...
$ SectionAlignment : num 4096 4096 4096 4096 4096 ...
$ FileAlignment : num 512 512 512 512 512 ...
$ MajorOperatingSystemVersion: num 4 4 5 6 6 1 4 4 6 5 ...
$ MinorOperatingSystemVersion: num 0 0 0 1 2 0 0 0 2 0 ...
$ MajorImageVersion : num 0 0 0 6 6 0 0 0 6 0 ...
$ MinorImageVersion : num 0 0 0 1 2 0 0 0 2 0 ...
$ MajorSubsystemVersion : num 5 4 5 5 6 4 4 4 6 5 ...
$ MinorSubsystemVersion : num 1 0 0 0 2 0 0 0 2 0 ...
$ SizeOfImage : num 1335296 20480 61440 94208 57344 ...
$ SizeOfHeaders : num 1024 1024 1024 512 1024 ...
$ CheckSum : num 1194954 0 67688 113668 69089 ...
$ Subsystem : num 3 2 2 2 2 2 2 2 3 3 ...
$ DllCharacteristics : num 64 0 320 1344 33088 ...
$ SizeOfStackReserve : num 1048576 1048576 1048576 1048576 262144 ...
$ SizeOfStackCommit : num 4096 4096 4096 4096 8192 ...
$ SizeOfHeapReserve : num 1048576 1048576 1048576 1048576 1048576 ...
$ SizeOfHeapCommit : num 4096 4096 4096 4096 4096 ...
$ LoaderFlags : num 0 0 0 0 0 0 0 0 0 0 ...
$ NumberOfRvaAndSizes : num 16 16 16 16 16 16 16 16 16 16 ...
$ class : num 0 0 0 0 0 0 0 0 0 0 ...
Based on the structure observed, we add and rename columns to make the dataset more relevant to Tesla
# Rename columns: map each original PE-header name to its Tesla-themed name.
# Names that are not present in `clamp` are simply left untouched.
names(clamp) <- local({
  new_name_for <- c(
    ImageBase = "ChargeCycles",
    SizeOfImage = "CarMileage",
    CreationYear = "YearObtained",
    MajorSubsystemVersion = "SoftwareVersion",
    MinorSubsystemVersion = "OSVersion",
    Machine = "Models",
    class = "MalwareDetection"
  )
  current <- names(clamp)
  to_rename <- current %in% names(new_name_for)
  current[to_rename] <- new_name_for[current[to_rename]]
  current
})
# Adding in new column: attach the lookup table's columns by matching
# e_lfanew against CountryID, then drop the join key from the result.
teslacountries <- fread("TeslaCountries.csv")
clamp <- left_join(clamp, teslacountries, by = c("e_lfanew" = "CountryID"))
clamp[["e_lfanew"]] <- NULL
#clamp$Country <- as.factor(clamp$Country)
Converting to numeric columns
# Columns that are treated as continuous measurements.
clamp_num_names <- c(
  "NumberOfSections", "NumberOfSymbols", "SizeOfOptionalHeader",
  "ChargeCycles", "SizeOfInitializedData", "SizeOfUninitializedData",
  "AddressOfEntryPoint", "BaseOfCode", "BaseOfData", "SizeOfCode",
  "CarMileage", "SizeOfHeaders", "CheckSum", "SizeOfStackReserve",
  "SizeOfStackCommit", "SizeOfHeapReserve", "SizeOfHeapCommit",
  "NumberOfRvaAndSizes"
)
# Keep the names that actually exist in `clamp`, in the data's column order.
num_names <- intersect(names(clamp), clamp_num_names)
# Numeric-only view of the dataset.
clamp_num <- data.frame(lapply(clamp[num_names], as.numeric))
str(clamp_num)
'data.frame': 5184 obs. of 18 variables:
$ NumberOfSections : num 4 4 5 1 5 8 8 5 5 6 ...
$ NumberOfSymbols : num 0 0 0 0 0 0 0 0 0 0 ...
$ SizeOfOptionalHeader : num 224 224 224 224 224 224 224 224 224 224 ...
$ SizeOfCode : num 1100288 4096 27648 0 11776 ...
$ SizeOfInitializedData : num 225792 2560 20480 87552 36352 ...
$ SizeOfUninitializedData: num 0 0 0 0 0 0 0 0 0 0 ...
$ AddressOfEntryPoint : num 1069880 7680 28859 0 13379 ...
$ BaseOfCode : num 4096 4096 4096 4096 4096 ...
$ BaseOfData : num 1110016 8192 32768 4096 16384 ...
$ ChargeCycles : num 1.18e+09 2.68e+08 2.68e+08 2.68e+08 4.19e+06 ...
$ CarMileage : num 1335296 20480 61440 94208 57344 ...
$ SizeOfHeaders : num 1024 1024 1024 512 1024 ...
$ CheckSum : num 1194954 0 67688 113668 69089 ...
$ SizeOfStackReserve : num 1048576 1048576 1048576 1048576 262144 ...
$ SizeOfStackCommit : num 4096 4096 4096 4096 8192 ...
$ SizeOfHeapReserve : num 1048576 1048576 1048576 1048576 1048576 ...
$ SizeOfHeapCommit : num 4096 4096 4096 4096 4096 ...
$ NumberOfRvaAndSizes : num 16 16 16 16 16 16 16 16 16 16 ...
Converting the remaining columns to categorical (factor) columns
# Every column that is not listed in `num_names` is treated as categorical:
# select those columns and turn each one into a factor.
clamp_cat <- data.frame(
  lapply(clamp[setdiff(names(clamp), num_names)], factor)
)
str(clamp_cat)
'data.frame': 5184 obs. of 34 variables:
$ e_cblp : Factor w/ 9 levels "0","10","46",..: 8 8 8 8 8 5 8 8 8 8 ...
$ e_cp : Factor w/ 7 levels "0","1","2","3",..: 4 4 4 4 4 3 4 4 4 4 ...
$ e_cparhdr : Factor w/ 3 levels "0","2","4": 3 3 3 3 3 3 3 3 3 3 ...
$ e_minalloc : Factor w/ 4 levels "0","15","16",..: 1 1 1 1 1 2 1 1 1 1 ...
$ e_maxalloc : Factor w/ 3 levels "0","17744","65535": 3 3 3 3 3 3 3 3 3 3 ...
$ e_ss : Factor w/ 2 levels "0","65520": 1 1 1 1 1 1 1 1 1 1 ...
$ e_sp : Factor w/ 8 levels "0","40","64",..: 4 4 4 4 4 4 4 4 4 4 ...
$ e_csum : Factor w/ 3 levels "0","1","2": 1 1 1 1 1 1 1 1 1 1 ...
$ e_ip : Factor w/ 4 levels "0","256","1047",..: 1 1 1 1 1 1 1 1 1 1 ...
$ e_cs : Factor w/ 4 levels "0","18293","18919",..: 1 1 1 1 1 1 1 1 1 1 ...
$ e_lfarlc : Factor w/ 3 levels "0","64","65": 2 2 2 2 2 2 2 2 2 2 ...
$ e_ovno : Factor w/ 2 levels "0","26": 1 1 1 1 1 2 1 1 1 1 ...
$ e_oemid : Factor w/ 2 levels "0","267": 1 1 1 1 1 1 1 1 1 1 ...
$ e_oeminfo : Factor w/ 3 levels "0","6","8": 1 1 1 1 1 1 1 1 1 1 ...
$ Models : Factor w/ 3 levels "332","448","34404": 1 1 1 1 1 1 1 1 1 1 ...
$ YearObtained : Factor w/ 36 levels "1970","1971",..: 16 9 22 21 22 4 22 22 22 20 ...
$ PointerToSymbolTable : Factor w/ 9 levels "0","36384","109088",..: 1 1 1 1 1 1 1 1 1 1 ...
$ Characteristics : Factor w/ 42 levels "34","35","258",..: 27 30 27 27 3 40 30 27 3 11 ...
$ Magic : Factor w/ 2 levels "0","267": 2 2 2 2 2 2 2 2 2 2 ...
$ MajorLinkerVersion : Factor w/ 23 levels "0","1","2","3",..: 9 6 10 10 11 3 7 9 11 10 ...
$ MinorLinkerVersion : Factor w/ 36 levels "0","1","2","3",..: 1 11 1 1 11 22 1 1 11 1 ...
$ SectionAlignment : Factor w/ 6 levels "0","128","256",..: 5 5 5 5 5 5 5 5 5 5 ...
$ FileAlignment : Factor w/ 7 levels "0","128","256",..: 4 4 4 4 4 4 7 7 4 4 ...
$ MajorOperatingSystemVersion: Factor w/ 10 levels "0","1","2","4",..: 4 4 5 6 6 2 4 4 6 5 ...
$ MinorOperatingSystemVersion: Factor w/ 10 levels "0","1","2","3",..: 1 1 1 2 3 1 1 1 3 1 ...
$ MajorImageVersion : Factor w/ 41 levels "0","1","2","3",..: 1 1 1 7 7 1 1 1 7 1 ...
$ MinorImageVersion : Factor w/ 53 levels "0","1","2","3",..: 1 1 1 2 3 1 1 1 3 1 ...
$ SoftwareVersion : Factor w/ 6 levels "0","1","3","4",..: 5 4 5 5 6 4 4 4 6 5 ...
$ OSVersion : Factor w/ 5 levels "0","1","2","10",..: 2 1 1 1 3 1 1 1 3 1 ...
$ Subsystem : Factor w/ 6 levels "0","1","2","3",..: 4 3 3 3 3 3 3 3 4 4 ...
$ DllCharacteristics : Factor w/ 25 levels "0","1","2","3",..: 5 1 7 12 17 1 1 1 17 17 ...
$ LoaderFlags : Factor w/ 6 levels "0","4357151",..: 1 1 1 1 1 1 1 1 1 1 ...
$ MalwareDetection : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
$ Country : Factor w/ 39 levels "Australia","Austria",..: 5 3 32 3 25 5 32 5 39 25 ...
Rename factor values (revalue prints a message for any listed level that is not present in the data; pass warn_missing = FALSE to hide it)
# Give the machine codes and OS version codes readable labels. mapvalues() is
# the function revalue() delegates to; with the default warn_missing = TRUE it
# prints a message for any `from` level that does not occur in the factor.
clamp_cat$Models <- mapvalues(
  clamp_cat$Models,
  from = c("332", "448", "34404"),
  to = c("Model X", "Model Y", "Model S")
)
clamp_cat$OSVersion <- mapvalues(
  clamp_cat$OSVersion,
  from = c("0", "1", "2", "10", "20"),
  to = c("V5", "V4", "V3", "V2", "V1")
)
Final dataset
# Assemble the modelling table: factor columns first, numeric columns after.
clamp_model <- data.frame(clamp_cat, clamp_num, check.names = TRUE)
str(clamp_model)
'data.frame': 5184 obs. of 52 variables:
$ e_cblp : Factor w/ 9 levels "0","10","46",..: 8 8 8 8 8 5 8 8 8 8 ...
$ e_cp : Factor w/ 7 levels "0","1","2","3",..: 4 4 4 4 4 3 4 4 4 4 ...
$ e_cparhdr : Factor w/ 3 levels "0","2","4": 3 3 3 3 3 3 3 3 3 3 ...
$ e_minalloc : Factor w/ 4 levels "0","15","16",..: 1 1 1 1 1 2 1 1 1 1 ...
$ e_maxalloc : Factor w/ 3 levels "0","17744","65535": 3 3 3 3 3 3 3 3 3 3 ...
$ e_ss : Factor w/ 2 levels "0","65520": 1 1 1 1 1 1 1 1 1 1 ...
$ e_sp : Factor w/ 8 levels "0","40","64",..: 4 4 4 4 4 4 4 4 4 4 ...
$ e_csum : Factor w/ 3 levels "0","1","2": 1 1 1 1 1 1 1 1 1 1 ...
$ e_ip : Factor w/ 4 levels "0","256","1047",..: 1 1 1 1 1 1 1 1 1 1 ...
$ e_cs : Factor w/ 4 levels "0","18293","18919",..: 1 1 1 1 1 1 1 1 1 1 ...
$ e_lfarlc : Factor w/ 3 levels "0","64","65": 2 2 2 2 2 2 2 2 2 2 ...
$ e_ovno : Factor w/ 2 levels "0","26": 1 1 1 1 1 2 1 1 1 1 ...
$ e_oemid : Factor w/ 2 levels "0","267": 1 1 1 1 1 1 1 1 1 1 ...
$ e_oeminfo : Factor w/ 3 levels "0","6","8": 1 1 1 1 1 1 1 1 1 1 ...
$ Models : Factor w/ 3 levels "Model X","Model Y",..: 1 1 1 1 1 1 1 1 1 1 ...
$ YearObtained : Factor w/ 36 levels "1970","1971",..: 16 9 22 21 22 4 22 22 22 20 ...
$ PointerToSymbolTable : Factor w/ 9 levels "0","36384","109088",..: 1 1 1 1 1 1 1 1 1 1 ...
$ Characteristics : Factor w/ 42 levels "34","35","258",..: 27 30 27 27 3 40 30 27 3 11 ...
$ Magic : Factor w/ 2 levels "0","267": 2 2 2 2 2 2 2 2 2 2 ...
$ MajorLinkerVersion : Factor w/ 23 levels "0","1","2","3",..: 9 6 10 10 11 3 7 9 11 10 ...
$ MinorLinkerVersion : Factor w/ 36 levels "0","1","2","3",..: 1 11 1 1 11 22 1 1 11 1 ...
$ SectionAlignment : Factor w/ 6 levels "0","128","256",..: 5 5 5 5 5 5 5 5 5 5 ...
$ FileAlignment : Factor w/ 7 levels "0","128","256",..: 4 4 4 4 4 4 7 7 4 4 ...
$ MajorOperatingSystemVersion: Factor w/ 10 levels "0","1","2","4",..: 4 4 5 6 6 2 4 4 6 5 ...
$ MinorOperatingSystemVersion: Factor w/ 10 levels "0","1","2","3",..: 1 1 1 2 3 1 1 1 3 1 ...
$ MajorImageVersion : Factor w/ 41 levels "0","1","2","3",..: 1 1 1 7 7 1 1 1 7 1 ...
$ MinorImageVersion : Factor w/ 53 levels "0","1","2","3",..: 1 1 1 2 3 1 1 1 3 1 ...
$ SoftwareVersion : Factor w/ 6 levels "0","1","3","4",..: 5 4 5 5 6 4 4 4 6 5 ...
$ OSVersion : Factor w/ 5 levels "V5","V4","V3",..: 2 1 1 1 3 1 1 1 3 1 ...
$ Subsystem : Factor w/ 6 levels "0","1","2","3",..: 4 3 3 3 3 3 3 3 4 4 ...
$ DllCharacteristics : Factor w/ 25 levels "0","1","2","3",..: 5 1 7 12 17 1 1 1 17 17 ...
$ LoaderFlags : Factor w/ 6 levels "0","4357151",..: 1 1 1 1 1 1 1 1 1 1 ...
$ MalwareDetection : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
$ Country : Factor w/ 39 levels "Australia","Austria",..: 5 3 32 3 25 5 32 5 39 25 ...
$ NumberOfSections : num 4 4 5 1 5 8 8 5 5 6 ...
$ NumberOfSymbols : num 0 0 0 0 0 0 0 0 0 0 ...
$ SizeOfOptionalHeader : num 224 224 224 224 224 224 224 224 224 224 ...
$ SizeOfCode : num 1100288 4096 27648 0 11776 ...
$ SizeOfInitializedData : num 225792 2560 20480 87552 36352 ...
$ SizeOfUninitializedData : num 0 0 0 0 0 0 0 0 0 0 ...
$ AddressOfEntryPoint : num 1069880 7680 28859 0 13379 ...
$ BaseOfCode : num 4096 4096 4096 4096 4096 ...
$ BaseOfData : num 1110016 8192 32768 4096 16384 ...
$ ChargeCycles : num 1.18e+09 2.68e+08 2.68e+08 2.68e+08 4.19e+06 ...
$ CarMileage : num 1335296 20480 61440 94208 57344 ...
$ SizeOfHeaders : num 1024 1024 1024 512 1024 ...
$ CheckSum : num 1194954 0 67688 113668 69089 ...
$ SizeOfStackReserve : num 1048576 1048576 1048576 1048576 262144 ...
$ SizeOfStackCommit : num 4096 4096 4096 4096 8192 ...
$ SizeOfHeapReserve : num 1048576 1048576 1048576 1048576 1048576 ...
$ SizeOfHeapCommit : num 4096 4096 4096 4096 4096 ...
$ NumberOfRvaAndSizes : num 16 16 16 16 16 16 16 16 16 16 ...
# Export the assembled dataset to CSV, then list its column names.
fwrite(x = clamp_model, file = "TableauClampData.csv")
colnames(clamp_model)
[1] "e_cblp" "e_cp" "e_cparhdr"
[4] "e_minalloc" "e_maxalloc" "e_ss"
[7] "e_sp" "e_csum" "e_ip"
[10] "e_cs" "e_lfarlc" "e_ovno"
[13] "e_oemid" "e_oeminfo" "Models"
[16] "YearObtained" "PointerToSymbolTable" "Characteristics"
[19] "Magic" "MajorLinkerVersion" "MinorLinkerVersion"
[22] "SectionAlignment" "FileAlignment" "MajorOperatingSystemVersion"
[25] "MinorOperatingSystemVersion" "MajorImageVersion" "MinorImageVersion"
[28] "SoftwareVersion" "OSVersion" "Subsystem"
[31] "DllCharacteristics" "LoaderFlags" "MalwareDetection"
[34] "Country" "NumberOfSections" "NumberOfSymbols"
[37] "SizeOfOptionalHeader" "SizeOfCode" "SizeOfInitializedData"
[40] "SizeOfUninitializedData" "AddressOfEntryPoint" "BaseOfCode"
[43] "BaseOfData" "ChargeCycles" "CarMileage"
[46] "SizeOfHeaders" "CheckSum" "SizeOfStackReserve"
[49] "SizeOfStackCommit" "SizeOfHeapReserve" "SizeOfHeapCommit"
[52] "NumberOfRvaAndSizes"
# Numeric copy of the full dataset for a correlation overview.
# NOTE(review): as.numeric() on a factor yields its integer level codes, so
# nominal columns (e.g. Country, Models) enter the correlation as arbitrary
# codes rather than meaningful values - confirm this is intended.
clamp_corr <- data.frame(lapply(clamp_model, as.numeric))
corrplot(
  cor(clamp_corr),
  type = "upper",
  title = "Correlation Plot for Final Dataset",
  mar = c(0, 0, 1, 0),
  tl.cex = 0.5,
  tl.col = "black"
)
str(clamp_model)
'data.frame': 5184 obs. of 52 variables:
$ e_cblp : Factor w/ 9 levels "0","10","46",..: 8 8 8 8 8 5 8 8 8 8 ...
$ e_cp : Factor w/ 7 levels "0","1","2","3",..: 4 4 4 4 4 3 4 4 4 4 ...
$ e_cparhdr : Factor w/ 3 levels "0","2","4": 3 3 3 3 3 3 3 3 3 3 ...
$ e_minalloc : Factor w/ 4 levels "0","15","16",..: 1 1 1 1 1 2 1 1 1 1 ...
$ e_maxalloc : Factor w/ 3 levels "0","17744","65535": 3 3 3 3 3 3 3 3 3 3 ...
$ e_ss : Factor w/ 2 levels "0","65520": 1 1 1 1 1 1 1 1 1 1 ...
$ e_sp : Factor w/ 8 levels "0","40","64",..: 4 4 4 4 4 4 4 4 4 4 ...
$ e_csum : Factor w/ 3 levels "0","1","2": 1 1 1 1 1 1 1 1 1 1 ...
$ e_ip : Factor w/ 4 levels "0","256","1047",..: 1 1 1 1 1 1 1 1 1 1 ...
$ e_cs : Factor w/ 4 levels "0","18293","18919",..: 1 1 1 1 1 1 1 1 1 1 ...
$ e_lfarlc : Factor w/ 3 levels "0","64","65": 2 2 2 2 2 2 2 2 2 2 ...
$ e_ovno : Factor w/ 2 levels "0","26": 1 1 1 1 1 2 1 1 1 1 ...
$ e_oemid : Factor w/ 2 levels "0","267": 1 1 1 1 1 1 1 1 1 1 ...
$ e_oeminfo : Factor w/ 3 levels "0","6","8": 1 1 1 1 1 1 1 1 1 1 ...
$ Models : Factor w/ 3 levels "Model X","Model Y",..: 1 1 1 1 1 1 1 1 1 1 ...
$ YearObtained : Factor w/ 36 levels "1970","1971",..: 16 9 22 21 22 4 22 22 22 20 ...
$ PointerToSymbolTable : Factor w/ 9 levels "0","36384","109088",..: 1 1 1 1 1 1 1 1 1 1 ...
$ Characteristics : Factor w/ 42 levels "34","35","258",..: 27 30 27 27 3 40 30 27 3 11 ...
$ Magic : Factor w/ 2 levels "0","267": 2 2 2 2 2 2 2 2 2 2 ...
$ MajorLinkerVersion : Factor w/ 23 levels "0","1","2","3",..: 9 6 10 10 11 3 7 9 11 10 ...
$ MinorLinkerVersion : Factor w/ 36 levels "0","1","2","3",..: 1 11 1 1 11 22 1 1 11 1 ...
$ SectionAlignment : Factor w/ 6 levels "0","128","256",..: 5 5 5 5 5 5 5 5 5 5 ...
$ FileAlignment : Factor w/ 7 levels "0","128","256",..: 4 4 4 4 4 4 7 7 4 4 ...
$ MajorOperatingSystemVersion: Factor w/ 10 levels "0","1","2","4",..: 4 4 5 6 6 2 4 4 6 5 ...
$ MinorOperatingSystemVersion: Factor w/ 10 levels "0","1","2","3",..: 1 1 1 2 3 1 1 1 3 1 ...
$ MajorImageVersion : Factor w/ 41 levels "0","1","2","3",..: 1 1 1 7 7 1 1 1 7 1 ...
$ MinorImageVersion : Factor w/ 53 levels "0","1","2","3",..: 1 1 1 2 3 1 1 1 3 1 ...
$ SoftwareVersion : Factor w/ 6 levels "0","1","3","4",..: 5 4 5 5 6 4 4 4 6 5 ...
$ OSVersion : Factor w/ 5 levels "V5","V4","V3",..: 2 1 1 1 3 1 1 1 3 1 ...
$ Subsystem : Factor w/ 6 levels "0","1","2","3",..: 4 3 3 3 3 3 3 3 4 4 ...
$ DllCharacteristics : Factor w/ 25 levels "0","1","2","3",..: 5 1 7 12 17 1 1 1 17 17 ...
$ LoaderFlags : Factor w/ 6 levels "0","4357151",..: 1 1 1 1 1 1 1 1 1 1 ...
$ MalwareDetection : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
$ Country : Factor w/ 39 levels "Australia","Austria",..: 5 3 32 3 25 5 32 5 39 25 ...
$ NumberOfSections : num 4 4 5 1 5 8 8 5 5 6 ...
$ NumberOfSymbols : num 0 0 0 0 0 0 0 0 0 0 ...
$ SizeOfOptionalHeader : num 224 224 224 224 224 224 224 224 224 224 ...
$ SizeOfCode : num 1100288 4096 27648 0 11776 ...
$ SizeOfInitializedData : num 225792 2560 20480 87552 36352 ...
$ SizeOfUninitializedData : num 0 0 0 0 0 0 0 0 0 0 ...
$ AddressOfEntryPoint : num 1069880 7680 28859 0 13379 ...
$ BaseOfCode : num 4096 4096 4096 4096 4096 ...
$ BaseOfData : num 1110016 8192 32768 4096 16384 ...
$ ChargeCycles : num 1.18e+09 2.68e+08 2.68e+08 2.68e+08 4.19e+06 ...
$ CarMileage : num 1335296 20480 61440 94208 57344 ...
$ SizeOfHeaders : num 1024 1024 1024 512 1024 ...
$ CheckSum : num 1194954 0 67688 113668 69089 ...
$ SizeOfStackReserve : num 1048576 1048576 1048576 1048576 262144 ...
$ SizeOfStackCommit : num 4096 4096 4096 4096 8192 ...
$ SizeOfHeapReserve : num 1048576 1048576 1048576 1048576 1048576 ...
$ SizeOfHeapCommit : num 4096 4096 4096 4096 4096 ...
$ NumberOfRvaAndSizes : num 16 16 16 16 16 16 16 16 16 16 ...
Data visualisation. Numerical variables: univariate histogram analysis
# Histogram panel 1: one facet per selected numeric column.
clamp_num1 <- clamp_model[
  , c("NumberOfSections", "NumberOfRvaAndSizes", "SizeOfOptionalHeader")
]
# Reshape to long form with data.table's own melt method. Calling melt() on a
# plain data.frame only works through data.table's deprecated redirection to
# reshape2; naming measure.vars keeps every column as a measured variable.
ggplot(
  data.table::melt(as.data.table(clamp_num1), measure.vars = names(clamp_num1)),
  aes(x = value)
) +
  facet_wrap(~ variable, scales = "free") +
  geom_histogram(binwidth = 1, fill = "indianred3", colour = "black") +
  theme_minimal() +
  labs(x = "Factors", y = "Distribution", title = "Histogram of X Factors Part 1") +
  theme(plot.title = element_text(hjust = 0.4))
The melt generic in data.table has been passed a data.frame and will attempt to redirect to the relevant reshape2 method; please note that reshape2 is deprecated, and this redirection is now deprecated as well. To continue using melt methods from reshape2 while both libraries are attached, e.g. melt.list, you can prepend the namespace like reshape2::melt(clamp_num1). In the next version, this warning will become an error.No id variables; using all as measure variables
# Histogram panel 2: one facet per selected numeric column.
clamp_num2 <- clamp_model[
  , c("ChargeCycles", "SizeOfInitializedData", "SizeOfUninitializedData")
]
# Reshape to long form with data.table's own melt method. Calling melt() on a
# plain data.frame only works through data.table's deprecated redirection to
# reshape2; naming measure.vars keeps every column as a measured variable.
ggplot(
  data.table::melt(as.data.table(clamp_num2), measure.vars = names(clamp_num2)),
  aes(x = value)
) +
  facet_wrap(~ variable, scales = "free") +
  geom_histogram(binwidth = 1000000, fill = "indianred3", colour = "black") +
  theme_minimal() +
  labs(x = "Factors", y = "Distribution", title = "Histogram of X Factors Part 2") +
  theme(plot.title = element_text(hjust = 0.4))
The melt generic in data.table has been passed a data.frame and will attempt to redirect to the relevant reshape2 method; please note that reshape2 is deprecated, and this redirection is now deprecated as well. To continue using melt methods from reshape2 while both libraries are attached, e.g. melt.list, you can prepend the namespace like reshape2::melt(clamp_num2). In the next version, this warning will become an error.No id variables; using all as measure variables
# Histogram panel 3: one facet per selected numeric column.
clamp_num3 <- clamp_model[, c("AddressOfEntryPoint", "BaseOfCode", "BaseOfData")]
# Reshape to long form with data.table's own melt method. Calling melt() on a
# plain data.frame only works through data.table's deprecated redirection to
# reshape2; naming measure.vars keeps every column as a measured variable.
ggplot(
  data.table::melt(as.data.table(clamp_num3), measure.vars = names(clamp_num3)),
  aes(x = value)
) +
  facet_wrap(~ variable, scales = "free") +
  geom_histogram(binwidth = 1000000, fill = "indianred3", colour = "black") +
  theme_minimal() +
  labs(x = "Factors", y = "Distribution", title = "Histogram of X Factors Part 3") +
  theme(plot.title = element_text(hjust = 0.4))
The melt generic in data.table has been passed a data.frame and will attempt to redirect to the relevant reshape2 method; please note that reshape2 is deprecated, and this redirection is now deprecated as well. To continue using melt methods from reshape2 while both libraries are attached, e.g. melt.list, you can prepend the namespace like reshape2::melt(clamp_num3). In the next version, this warning will become an error.No id variables; using all as measure variables
# Histogram panel 4: one facet per selected numeric column.
clamp_num4 <- clamp_model[, c("SizeOfCode", "CarMileage", "SizeOfHeaders")]
# Reshape to long form with data.table's own melt method. Calling melt() on a
# plain data.frame only works through data.table's deprecated redirection to
# reshape2; naming measure.vars keeps every column as a measured variable.
ggplot(
  data.table::melt(as.data.table(clamp_num4), measure.vars = names(clamp_num4)),
  aes(x = value)
) +
  facet_wrap(~ variable, scales = "free") +
  geom_histogram(binwidth = 1000000, fill = "indianred3", colour = "black") +
  theme_minimal() +
  labs(x = "Factors", y = "Distribution", title = "Histogram of X Factors Part 4") +
  theme(plot.title = element_text(hjust = 0.4))
The melt generic in data.table has been passed a data.frame and will attempt to redirect to the relevant reshape2 method; please note that reshape2 is deprecated, and this redirection is now deprecated as well. To continue using melt methods from reshape2 while both libraries are attached, e.g. melt.list, you can prepend the namespace like reshape2::melt(clamp_num4). In the next version, this warning will become an error.No id variables; using all as measure variables
# Histogram panel 5: one facet per selected numeric column.
clamp_num5 <- clamp_model[, c("CheckSum", "SizeOfStackReserve", "SizeOfStackCommit")]
# Reshape to long form with data.table's own melt method. Calling melt() on a
# plain data.frame only works through data.table's deprecated redirection to
# reshape2; naming measure.vars keeps every column as a measured variable.
ggplot(
  data.table::melt(as.data.table(clamp_num5), measure.vars = names(clamp_num5)),
  aes(x = value)
) +
  facet_wrap(~ variable, scales = "free") +
  geom_histogram(binwidth = 1000000, fill = "indianred3", colour = "black") +
  theme_minimal() +
  labs(x = "Factors", y = "Distribution", title = "Histogram of X Factors Part 5") +
  theme(plot.title = element_text(hjust = 0.4))
The melt generic in data.table has been passed a data.frame and will attempt to redirect to the relevant reshape2 method; please note that reshape2 is deprecated, and this redirection is now deprecated as well. To continue using melt methods from reshape2 while both libraries are attached, e.g. melt.list, you can prepend the namespace like reshape2::melt(clamp_num5). In the next version, this warning will become an error.No id variables; using all as measure variables
# Histogram panel 6: one facet per selected numeric column.
clamp_num6 <- clamp_model[, c("SizeOfHeapReserve", "SizeOfHeapCommit", "NumberOfSymbols")]
# Reshape to long form with data.table's own melt method. Calling melt() on a
# plain data.frame only works through data.table's deprecated redirection to
# reshape2; naming measure.vars keeps every column as a measured variable.
ggplot(
  data.table::melt(as.data.table(clamp_num6), measure.vars = names(clamp_num6)),
  aes(x = value)
) +
  facet_wrap(~ variable, scales = "free") +
  geom_histogram(binwidth = 10000, fill = "indianred3", colour = "black") +
  theme_minimal() +
  labs(x = "Factors", y = "Distribution", title = "Histogram of X Factors Part 6") +
  theme(plot.title = element_text(hjust = 0.4))
The melt generic in data.table has been passed a data.frame and will attempt to redirect to the relevant reshape2 method; please note that reshape2 is deprecated, and this redirection is now deprecated as well. To continue using melt methods from reshape2 while both libraries are attached, e.g. melt.list, you can prepend the namespace like reshape2::melt(clamp_num6). In the next version, this warning will become an error.No id variables; using all as measure variables
Numerical variables: Univariate density analysis
# Density panel 1: one stacked facet per column of clamp_num1.
# The data.frame is converted and melted with data.table's own method, because
# melt() on a plain data.frame depends on the deprecated redirect to reshape2.
ggplot(
  data.table::melt(as.data.table(clamp_num1), measure.vars = names(clamp_num1)),
  aes(x = value)
) +
  facet_wrap(~ variable, scales = "free", ncol = 1) +
  geom_density(fill = "indianred3") +
  theme_minimal() +
  labs(x = "Factors", y = "Density", title = "Density Plot of X Factors Part 1") +
  theme(plot.title = element_text(hjust = 0.4))
The melt generic in data.table has been passed a data.frame and will attempt to redirect to the relevant reshape2 method; please note that reshape2 is deprecated, and this redirection is now deprecated as well. To continue using melt methods from reshape2 while both libraries are attached, e.g. melt.list, you can prepend the namespace like reshape2::melt(clamp_num1). In the next version, this warning will become an error.No id variables; using all as measure variables
# Density panel 2: one stacked facet per column of clamp_num2.
# The data.frame is converted and melted with data.table's own method, because
# melt() on a plain data.frame depends on the deprecated redirect to reshape2.
ggplot(
  data.table::melt(as.data.table(clamp_num2), measure.vars = names(clamp_num2)),
  aes(x = value)
) +
  facet_wrap(~ variable, scales = "free", ncol = 1) +
  geom_density(fill = "indianred3") +
  theme_minimal() +
  labs(x = "Factors", y = "Density", title = "Density Plot of X Factors Part 2") +
  theme(plot.title = element_text(hjust = 0.4))
The melt generic in data.table has been passed a data.frame and will attempt to redirect to the relevant reshape2 method; please note that reshape2 is deprecated, and this redirection is now deprecated as well. To continue using melt methods from reshape2 while both libraries are attached, e.g. melt.list, you can prepend the namespace like reshape2::melt(clamp_num2). In the next version, this warning will become an error.No id variables; using all as measure variables
# Density panel 3: one stacked facet per column of clamp_num3.
# The data.frame is converted and melted with data.table's own method, because
# melt() on a plain data.frame depends on the deprecated redirect to reshape2.
ggplot(
  data.table::melt(as.data.table(clamp_num3), measure.vars = names(clamp_num3)),
  aes(x = value)
) +
  facet_wrap(~ variable, scales = "free", ncol = 1) +
  geom_density(fill = "indianred3") +
  theme_minimal() +
  labs(x = "Factors", y = "Density", title = "Density Plot of X Factors Part 3") +
  theme(plot.title = element_text(hjust = 0.4))
The melt generic in data.table has been passed a data.frame and will attempt to redirect to the relevant reshape2 method; please note that reshape2 is deprecated, and this redirection is now deprecated as well. To continue using melt methods from reshape2 while both libraries are attached, e.g. melt.list, you can prepend the namespace like reshape2::melt(clamp_num3). In the next version, this warning will become an error.No id variables; using all as measure variables
# Density panel 4: one stacked facet per column of clamp_num4.
# The data.frame is converted and melted with data.table's own method, because
# melt() on a plain data.frame depends on the deprecated redirect to reshape2.
ggplot(
  data.table::melt(as.data.table(clamp_num4), measure.vars = names(clamp_num4)),
  aes(x = value)
) +
  facet_wrap(~ variable, scales = "free", ncol = 1) +
  geom_density(fill = "indianred3") +
  theme_minimal() +
  labs(x = "Factors", y = "Density", title = "Density Plot of X Factors Part 4") +
  theme(plot.title = element_text(hjust = 0.4))
The melt generic in data.table has been passed a data.frame and will attempt to redirect to the relevant reshape2 method; please note that reshape2 is deprecated, and this redirection is now deprecated as well. To continue using melt methods from reshape2 while both libraries are attached, e.g. melt.list, you can prepend the namespace like reshape2::melt(clamp_num4). In the next version, this warning will become an error.No id variables; using all as measure variables
# Density panel 5: one stacked facet per column of clamp_num5.
# The data.frame is converted and melted with data.table's own method, because
# melt() on a plain data.frame depends on the deprecated redirect to reshape2.
ggplot(
  data.table::melt(as.data.table(clamp_num5), measure.vars = names(clamp_num5)),
  aes(x = value)
) +
  facet_wrap(~ variable, scales = "free", ncol = 1) +
  geom_density(fill = "indianred3") +
  theme_minimal() +
  labs(x = "Factors", y = "Density", title = "Density Plot of X Factors Part 5") +
  theme(plot.title = element_text(hjust = 0.4))
The melt generic in data.table has been passed a data.frame and will attempt to redirect to the relevant reshape2 method; please note that reshape2 is deprecated, and this redirection is now deprecated as well. To continue using melt methods from reshape2 while both libraries are attached, e.g. melt.list, you can prepend the namespace like reshape2::melt(clamp_num5). In the next version, this warning will become an error.No id variables; using all as measure variables
# Density panel 6: one stacked facet per column of clamp_num6.
# The data.frame is converted and melted with data.table's own method, because
# melt() on a plain data.frame depends on the deprecated redirect to reshape2.
ggplot(
  data.table::melt(as.data.table(clamp_num6), measure.vars = names(clamp_num6)),
  aes(x = value)
) +
  facet_wrap(~ variable, scales = "free", ncol = 1) +
  geom_density(fill = "indianred3") +
  theme_minimal() +
  labs(x = "Factors", y = "Density", title = "Density Plot of X Factors Part 6") +
  theme(plot.title = element_text(hjust = 0.4))
The melt generic in data.table has been passed a data.frame and will attempt to redirect to the relevant reshape2 method; please note that reshape2 is deprecated, and this redirection is now deprecated as well. To continue using melt methods from reshape2 while both libraries are attached, e.g. melt.list, you can prepend the namespace like reshape2::melt(clamp_num6). In the next version, this warning will become an error.No id variables; using all as measure variables
Numerical analysis: Correlation plot
# Work on a copy so the modelling table itself is left untouched.
corrData <- copy(clamp_model)
# Recode the target factor (levels "0"/"1") to its numeric level codes so it
# is picked up by the numeric-column filter below.
corrData$MalwareDetection <- as.numeric(
  factor(corrData$MalwareDetection, levels = c("0", "1"), exclude = NULL)
)
# Correlation Matrix: restrict to the numeric columns only.
corrDataNum <- corrData[, vapply(corrData, is.numeric, logical(1))]
corrplot(
  cor(corrDataNum),
  type = "upper",
  title = "Correlation Plot for Numeric Data",
  mar = c(0, 0, 1, 0),
  tl.cex = 0.5,
  tl.col = "black"
)
Categorical variables: Univariate barplot analysis
# Categorical bar-plot panel 1. The last selected column is used only as the
# id variable for the reshape, so the four columns before it are plotted.
ClampCat1 <- clamp_model[
  , c("e_cblp", "e_cp", "e_cparhdr", "e_minalloc", "e_maxalloc")
]
# Melt through data.table's own method; melt() on a plain data.frame relies on
# the deprecated redirection to reshape2.
ggplot(
  data.table::melt(as.data.table(ClampCat1), id.vars = "e_maxalloc"),
  aes(y = value)
) +
  facet_wrap(~ variable, scales = "free", ncol = 2) +
  geom_bar(fill = "indianred3", color = "black") +
  theme_minimal() +
  theme(text = element_text(size = 10)) +
  labs(x = "Factors", y = "Levels", title = "Barplot of Categorical X Factors Part 1") +
  theme(plot.title = element_text(hjust = 0.5))
The melt generic in data.table has been passed a data.frame and will attempt to redirect to the relevant reshape2 method; please note that reshape2 is deprecated, and this redirection is now deprecated as well. To continue using melt methods from reshape2 while both libraries are attached, e.g. melt.list, you can prepend the namespace like reshape2::melt(ClampCat1). In the next version, this warning will become an error.attributes are not identical across measure variables; they will be dropped
# Categorical bar-plot panel 2. The last selected column is used only as the
# id variable for the reshape, so the four columns before it are plotted.
ClampCat2 <- clamp_model[, c("e_maxalloc", "e_ss", "e_sp", "e_csum", "e_ip")]
# Melt through data.table's own method; melt() on a plain data.frame relies on
# the deprecated redirection to reshape2.
ggplot(
  data.table::melt(as.data.table(ClampCat2), id.vars = "e_ip"),
  aes(y = value)
) +
  facet_wrap(~ variable, scales = "free", ncol = 2) +
  geom_bar(fill = "indianred3", color = "black") +
  theme_minimal() +
  theme(text = element_text(size = 10)) +
  labs(x = "Factors", y = "Levels", title = "Barplot of Categorical X Factors Part 2") +
  theme(plot.title = element_text(hjust = 0.5))
The melt generic in data.table has been passed a data.frame and will attempt to redirect to the relevant reshape2 method; please note that reshape2 is deprecated, and this redirection is now deprecated as well. To continue using melt methods from reshape2 while both libraries are attached, e.g. melt.list, you can prepend the namespace like reshape2::melt(ClampCat2). In the next version, this warning will become an error.attributes are not identical across measure variables; they will be dropped
# Bar charts of level counts for the third batch of categorical columns.
# "e_oemid" is only the melt id column here, so it gets no facet of its own.
ClampCat3 <- clamp_model[, c("e_ip", "e_cs", "e_lfarlc", "e_ovno", "e_oemid")]
# Melt a data.table so melt() uses data.table's own method instead of the
# deprecated redirect to reshape2 (announced to become an error).
ggplot(melt(as.data.table(ClampCat3), id.vars = "e_oemid"), aes(y = value)) +
  facet_wrap(~ variable, scales = "free", ncol = 2) +
  geom_bar(fill = "indianred3", color = "black") +
  theme_minimal() +
  theme(text = element_text(size = 10)) +
  labs(x = "Factors", y = "Levels", title = "Barplot of Categorical X Factors Part 3") +
  theme(plot.title = element_text(hjust = 0.5))
The melt generic in data.table has been passed a data.frame and will attempt to redirect to the relevant reshape2 method; please note that reshape2 is deprecated, and this redirection is now deprecated as well. To continue using melt methods from reshape2 while both libraries are attached, e.g. melt.list, you can prepend the namespace like reshape2::melt(ClampCat3). In the next version, this warning will become an error.attributes are not identical across measure variables; they will be dropped
# Bar charts of level counts for the fourth batch of categorical columns.
# "PointerToSymbolTable" is only the melt id column here, so it gets no facet.
ClampCat4 <- clamp_model[, c("e_oemid", "e_oeminfo", "Models", "Magic", "PointerToSymbolTable")]
# Melt a data.table so melt() uses data.table's own method instead of the
# deprecated redirect to reshape2 (announced to become an error).
ggplot(melt(as.data.table(ClampCat4), id.vars = "PointerToSymbolTable"), aes(y = value)) +
  facet_wrap(~ variable, scales = "free", ncol = 2) +
  geom_bar(fill = "indianred3", color = "black") +
  theme_minimal() +
  theme(text = element_text(size = 10)) +
  labs(x = "Factors", y = "Levels", title = "Barplot of Categorical X Factors Part 4") +
  theme(plot.title = element_text(hjust = 0.5))
The melt generic in data.table has been passed a data.frame and will attempt to redirect to the relevant reshape2 method; please note that reshape2 is deprecated, and this redirection is now deprecated as well. To continue using melt methods from reshape2 while both libraries are attached, e.g. melt.list, you can prepend the namespace like reshape2::melt(ClampCat4). In the next version, this warning will become an error.attributes are not identical across measure variables; they will be dropped
# Bar charts of level counts for the fifth batch of categorical columns.
# "MinorOperatingSystemVersion" is only the melt id column here.
ClampCat5 <- clamp_model[, c("PointerToSymbolTable", "SectionAlignment", "FileAlignment",
                             "MajorOperatingSystemVersion", "MinorOperatingSystemVersion")]
# Melt a data.table so melt() uses data.table's own method instead of the
# deprecated redirect to reshape2 (announced to become an error).
ggplot(melt(as.data.table(ClampCat5), id.vars = "MinorOperatingSystemVersion"), aes(y = value)) +
  facet_wrap(~ variable, scales = "free", ncol = 2) +
  geom_bar(fill = "indianred3", color = "black") +
  theme_minimal() +
  theme(text = element_text(size = 10)) +
  labs(x = "Factors", y = "Levels", title = "Barplot of Categorical X Factors Part 5") +
  theme(plot.title = element_text(hjust = 0.5))
The melt generic in data.table has been passed a data.frame and will attempt to redirect to the relevant reshape2 method; please note that reshape2 is deprecated, and this redirection is now deprecated as well. To continue using melt methods from reshape2 while both libraries are attached, e.g. melt.list, you can prepend the namespace like reshape2::melt(ClampCat5). In the next version, this warning will become an error.attributes are not identical across measure variables; they will be dropped
# Bar charts of level counts for the sixth batch of categorical columns.
# "LoaderFlags" is only the melt id column here, so it gets no facet of its own.
ClampCat6 <- clamp_model[, c("MinorOperatingSystemVersion", "SoftwareVersion", "OSVersion",
                             "Subsystem", "LoaderFlags")]
# Melt a data.table so melt() uses data.table's own method instead of the
# deprecated redirect to reshape2 (announced to become an error).
ggplot(melt(as.data.table(ClampCat6), id.vars = "LoaderFlags"), aes(y = value)) +
  facet_wrap(~ variable, scales = "free", ncol = 2) +
  geom_bar(fill = "indianred3", color = "black") +
  theme_minimal() +
  theme(text = element_text(size = 10)) +
  labs(x = "Factors", y = "Levels", title = "Barplot of Categorical X Factors Part 6") +
  theme(plot.title = element_text(hjust = 0.5))
The melt generic in data.table has been passed a data.frame and will attempt to redirect to the relevant reshape2 method; please note that reshape2 is deprecated, and this redirection is now deprecated as well. To continue using melt methods from reshape2 while both libraries are attached, e.g. melt.list, you can prepend the namespace like reshape2::melt(ClampCat6). In the next version, this warning will become an error.attributes are not identical across measure variables; they will be dropped
# Bar charts of level counts for LoaderFlags and MalwareDetection.
# "Country" is only the melt id column here, so it gets no facet of its own.
ClampCat7 <- clamp_model[, c("LoaderFlags", "MalwareDetection", "Country")]
# Melt a data.table so melt() uses data.table's own method instead of the
# deprecated redirect to reshape2 (announced to become an error).
ggplot(melt(as.data.table(ClampCat7), id.vars = "Country"), aes(y = value)) +
  facet_wrap(~ variable, scales = "free", ncol = 2) +
  geom_bar(fill = "indianred3", color = "black") +
  theme_minimal() +
  theme(text = element_text(size = 10)) +
  labs(x = "Factors", y = "Levels", title = "Barplot of Categorical X Factors Part 7") +
  theme(plot.title = element_text(hjust = 0.5))
The melt generic in data.table has been passed a data.frame and will attempt to redirect to the relevant reshape2 method; please note that reshape2 is deprecated, and this redirection is now deprecated as well. To continue using melt methods from reshape2 while both libraries are attached, e.g. melt.list, you can prepend the namespace like reshape2::melt(ClampCat7). In the next version, this warning will become an error.attributes are not identical across measure variables; they will be dropped
# Count of samples per YearObtained level.
# The x-axis label now names the plotted variable (it previously read "Outcome").
ggplot(clamp_model, aes(x = YearObtained)) +
  geom_bar(fill = "indianred3", colour = "black", width = 0.5,
           position = position_dodge(width = 5)) +
  labs(x = "Year Obtained",
       y = "Count",
       title = "Barplot of Year of Origination Distribution") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1),
        plot.title = element_text(hjust = 0.5))
# Count of samples per Characteristics level.
ggplot(clamp_model, aes(x = Characteristics)) +
  geom_bar(fill = "indianred3", colour = "black", width = 0.5,
           position = position_dodge(width = 5)) +
  labs(x = "Characteristics",
       y = "Count",
       title = "Barplot of Characteristics Distribution") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1),
        plot.title = element_text(hjust = 0.5))
# Bar charts of level counts for the two linker-version columns.
# "MajorImageVersion" is only the melt id column here, so it gets no facet.
ClampCat8 <- clamp_model[, c("MajorLinkerVersion", "MinorLinkerVersion", "MajorImageVersion")]
# Melt a data.table so melt() uses data.table's own method instead of the
# deprecated redirect to reshape2 (announced to become an error).
ggplot(melt(as.data.table(ClampCat8), id.vars = "MajorImageVersion"), aes(y = value)) +
  facet_wrap(~ variable, scales = "free", ncol = 2) +
  geom_bar(fill = "indianred3", color = "black") +
  theme_minimal() +
  theme(text = element_text(size = 10)) +
  labs(x = "Factors", y = "Levels", title = "Barplot of Categorical X Factors Part 8") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1),
        plot.title = element_text(hjust = 0.5))
The melt generic in data.table has been passed a data.frame and will attempt to redirect to the relevant reshape2 method; please note that reshape2 is deprecated, and this redirection is now deprecated as well. To continue using melt methods from reshape2 while both libraries are attached, e.g. melt.list, you can prepend the namespace like reshape2::melt(ClampCat8). In the next version, this warning will become an error.attributes are not identical across measure variables; they will be dropped
# Single-variable count barplots. Each x-axis label now names the plotted
# variable (all four previously read "Outcome").
ggplot(clamp_model, aes(x = MajorImageVersion)) +
  geom_bar(fill = "indianred3", colour = "black", width = 0.5,
           position = position_dodge(width = 5)) +
  labs(x = "Major Image Version",
       y = "Count",
       title = "Barplot of MajorImageVersion Distribution") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1),
        plot.title = element_text(hjust = 0.5))
ggplot(clamp_model, aes(x = MinorImageVersion)) +
  geom_bar(fill = "indianred3", colour = "black", width = 0.5,
           position = position_dodge(width = 5)) +
  labs(x = "Minor Image Version",
       y = "Count",
       title = "Barplot of MinorImageVersion Distribution") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1),
        plot.title = element_text(hjust = 0.5))
ggplot(clamp_model, aes(x = DllCharacteristics)) +
  geom_bar(fill = "indianred3", colour = "black", width = 0.5,
           position = position_dodge(width = 5)) +
  labs(x = "DLL Characteristics",
       y = "Count",
       title = "Barplot of DllCharacteristics Distribution") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1),
        plot.title = element_text(hjust = 0.5))
ggplot(clamp_model, aes(x = Country)) +
  geom_bar(fill = "indianred3", colour = "black", width = 0.5,
           position = position_dodge(width = 5)) +
  labs(x = "Country",
       y = "Count",
       title = "Barplot of Country Distribution") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1),
        plot.title = element_text(hjust = 0.5))
Bivariate analysis
# Malware Detection against Country (Proportion)
# One row per (Country, MalwareDetection) pair with its count n, the share of
# that count within the country (pct) and a percent-formatted label (lbl).
plotdata <- clamp_model %>%
  group_by(Country, MalwareDetection) %>%
  # Keep the Country grouping explicitly: pct below must be computed per country.
  dplyr::summarize(n = n(), .groups = "drop_last") %>%
  mutate(pct = n / sum(n),
         lbl = scales::percent(pct)) %>%
  ungroup()
`summarise()` regrouping output by 'Country' (override with `.groups` argument)
# Stacked-to-100% bars: share of each MalwareDetection level within a country.
# The fill aesthetic refers to the column directly; `plotdata$...` inside aes()
# bypasses ggplot's data masking.
ggplot(plotdata, aes(fill = factor(MalwareDetection), y = n, x = Country, group = Country)) +
  geom_bar(width = 0.5, position = "fill", stat = "identity") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1), plot.title = element_text(hjust = 0.5)) +
  labs(x = "Country", y = "Proportion", title = "MalwareDetection by Country", fill = "Malware Detected")
# Malware Detection against Year (Proportion)
# One row per (YearObtained, MalwareDetection) pair with its count n, the share
# of that count within the year (pct) and a percent-formatted label (lbl).
plotdata1 <- clamp_model %>%
  group_by(YearObtained, MalwareDetection) %>%
  # Keep the YearObtained grouping explicitly: pct below is computed per year.
  dplyr::summarize(n = n(), .groups = "drop_last") %>%
  mutate(pct = n / sum(n),
         lbl = scales::percent(pct)) %>%
  ungroup()
`summarise()` regrouping output by 'YearObtained' (override with `.groups` argument)
# Stacked-to-100% bars: share of each MalwareDetection level within a year.
# The fill aesthetic refers to the column directly; `plotdata1$...` inside
# aes() bypasses ggplot's data masking.
ggplot(plotdata1, aes(fill = factor(MalwareDetection), y = n, x = YearObtained, group = YearObtained)) +
  geom_bar(width = 0.5, position = "fill", stat = "identity") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1), plot.title = element_text(hjust = 0.5)) +
  labs(x = "Year", y = "Proportion", title = "Malware Detection by Year", fill = "Malware Detected")
# Malware Detection by Software Version
# One row per (SoftwareVersion, MalwareDetection) pair with its count n, the
# share of that count within the version (pct) and a percent label (lbl).
plotdata2 <- clamp_model %>%
  group_by(SoftwareVersion, MalwareDetection) %>%
  # Keep the SoftwareVersion grouping explicitly: pct is computed per version.
  dplyr::summarize(n = n(), .groups = "drop_last") %>%
  mutate(pct = n / sum(n),
         lbl = scales::percent(pct)) %>%
  ungroup()
`summarise()` regrouping output by 'SoftwareVersion' (override with `.groups` argument)
# Stacked-to-100% bars: share of each MalwareDetection level per software
# version. The fill aesthetic refers to the column directly; `plotdata2$...`
# inside aes() bypasses ggplot's data masking.
ggplot(plotdata2, aes(fill = factor(MalwareDetection), y = n, x = SoftwareVersion, group = SoftwareVersion)) +
  geom_bar(width = 0.5, position = "fill", stat = "identity") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1), plot.title = element_text(hjust = 0.5)) +
  labs(x = "Software Version", y = "Proportion", title = "Malware Detection by Software Version", fill = "Malware Detected")
# One row per (OSVersion, MalwareDetection) pair with its count n, the share of
# that count within the OS version (pct) and a percent-formatted label (lbl).
plotdata3 <- clamp_model %>%
  group_by(OSVersion, MalwareDetection) %>%
  # Keep the OSVersion grouping explicitly: pct below is computed per version.
  dplyr::summarize(n = n(), .groups = "drop_last") %>%
  mutate(pct = n / sum(n),
         lbl = scales::percent(pct)) %>%
  ungroup()
`summarise()` regrouping output by 'OSVersion' (override with `.groups` argument)
# Stacked-to-100% bars: share of each MalwareDetection level per OS version.
# The fill aesthetic refers to the column directly; `plotdata3$...` inside
# aes() bypasses ggplot's data masking.
ggplot(plotdata3, aes(fill = factor(MalwareDetection), y = n, x = OSVersion, group = OSVersion)) +
  geom_bar(width = 0.5, position = "fill", stat = "identity") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1), plot.title = element_text(hjust = 0.5)) +
  labs(x = "OS Version", y = "Proportion", title = "Malware Detection by OS Version", fill = "Malware Detected")
# One row per (Models, MalwareDetection) pair with its count n, the share of
# that count within the model (pct) and a percent-formatted label (lbl).
plotdata4 <- clamp_model %>%
  group_by(Models, MalwareDetection) %>%
  # Keep the Models grouping explicitly: pct below is computed per model.
  dplyr::summarize(n = n(), .groups = "drop_last") %>%
  mutate(pct = n / sum(n),
         lbl = scales::percent(pct)) %>%
  ungroup()
`summarise()` regrouping output by 'Models' (override with `.groups` argument)
# Stacked-to-100% bars: share of each MalwareDetection level per model.
# The fill aesthetic refers to the column directly; `plotdata4$...` inside
# aes() bypasses ggplot's data masking.
ggplot(plotdata4, aes(fill = factor(MalwareDetection), y = n, x = Models, group = Models)) +
  geom_bar(width = 0.5, position = "fill", stat = "identity") +
  theme(plot.title = element_text(hjust = 0.5)) +
  labs(x = "Tesla Models", y = "Proportion", title = "Malware Detection by Model", fill = "Malware Detected")
SMOTE
# Work on a separate object so clamp_model keeps the original class balance,
# then show the class counts before any resampling.
clamp_smoted <- clamp_model
table(clamp_smoted$MalwareDetection)
0 1
2501 2683
# Resample with SMOTE only when the two classes differ in size.
proportion <- data.frame(table(clamp_smoted$MalwareDetection))
# A single scalar inequality replaces the former `a < b | b < a` test.
if (proportion$Freq[1] != proportion$Freq[2]) {
  clamp_smoted <- SMOTE(MalwareDetection ~ ., clamp_smoted,
                        perc.over = 100, k = 5, perc.under = 200)
}
# Class counts after the (conditional) SMOTE step; the parentheses force printing.
(table(clamp_smoted$MalwareDetection))
0 1
5002 5002
Association rules: Initial preparation
# Functions used in transforming continuous to discrete data
#' Integer bin edges for discretising a numeric column.
#'
#' Splits the range from 0 to the column maximum into ten equal steps and
#' rounds every edge up to a whole number.
#'
#' @param column_name Numeric vector (the column's values, despite the name).
#' @return Increasing numeric vector of distinct bin edges starting at 0.
getBreaks <- function(column_name) {
  min_value <- 0
  # Ignore missing values so one NA does not turn every edge into NA.
  max_value <- max(column_name, na.rm = TRUE)
  interval <- (max_value - min_value) / 10
  breaks <- ceiling(seq(min_value, max_value, by = interval))
  # Rounding up can collapse neighbouring edges when the range is small;
  # cut() rejects duplicated breaks, so keep each edge once.
  unique(breaks)
}
#' Text labels ("start-end") for the bins produced by getBreaks().
#'
#' Every bin except the last is labelled with its lower edge and the next edge
#' minus one; the last bin is labelled with its lower edge and the final edge
#' itself.
#'
#' @param column_name Numeric vector (the column's values, despite the name).
#' @return Character vector with one label per bin, i.e. one fewer than the
#'   number of break points.
getLabels <- function(column_name) {
  breaks <- getBreaks(column_name)
  n_breaks <- length(breaks)
  lower <- breaks[-n_breaks]
  upper <- breaks[-1] - 1
  # The last label ends at the final break itself rather than one below it.
  upper[n_breaks - 1] <- breaks[n_breaks]
  paste(lower, upper, sep = "-")
}
# Discretise the continuous columns: start from a working copy of the modelling
# data and pull out its numeric columns, then list their names.
clamp_trans <- clamp_model
clamp_nums <- clamp_trans[, vapply(clamp_trans, is.numeric, logical(1))]
names(clamp_nums)
[1] "NumberOfSections" "NumberOfSymbols" "SizeOfOptionalHeader"
[4] "SizeOfCode" "SizeOfInitializedData" "SizeOfUninitializedData"
[7] "AddressOfEntryPoint" "BaseOfCode" "BaseOfData"
[10] "ChargeCycles" "CarMileage" "SizeOfHeaders"
[13] "CheckSum" "SizeOfStackReserve" "SizeOfStackCommit"
[16] "SizeOfHeapReserve" "SizeOfHeapCommit" "NumberOfRvaAndSizes"
# Bin the selected numeric columns into labelled intervals (the remaining
# numeric columns of clamp_nums are left as they are).
for (bin_col in c("NumberOfSymbols", "SizeOfStackReserve", "SizeOfInitializedData",
                  "SizeOfStackCommit", "AddressOfEntryPoint", "BaseOfCode",
                  "BaseOfData", "ChargeCycles", "SizeOfHeapReserve",
                  "SizeOfHeapCommit", "CarMileage")) {
  bin_values <- clamp_nums[[bin_col]]
  clamp_nums[[bin_col]] <- cut(
    bin_values,
    breaks = getBreaks(bin_values),
    labels = getLabels(bin_values),
    right = FALSE,
    # With right = FALSE the bins are [a, b); include.lowest closes the last
    # bin so a value equal to the final break (the column maximum) is binned
    # instead of becoming NA, matching the inclusive last label.
    include.lowest = TRUE
  )
}
# Swap the original numeric columns of clamp_trans for their processed
# versions held in clamp_nums.
clamp_nums_name <- names(clamp_nums)
clamp_trans[, clamp_nums_name] <- list(NULL)
clamp_trans <- data.frame(clamp_trans, clamp_nums)
# Converting to transactional data
# Every column must be a factor before coercion; `[] <-` keeps the data.frame
# shape and also copes with a zero-column frame, unlike a 1:ncol() loop.
clamp_trans[] <- lapply(clamp_trans, as.factor)
trans <- as(clamp_trans, "transactions")
Association rules
# Mine rules whose right-hand side is MalwareDetection=1.
# The captured log reports "Mining stopped (time limit reached)", so the rule
# set depended on machine speed. maxlen caps the pattern length explicitly and
# maxtime = 0 switches the time limit off, making the result reproducible.
# NOTE(review): without the time limit this call may run longer — confirm the
# runtime is acceptable or lower maxlen.
rules <- apriori(
  data = trans,
  parameter = list(supp = 0.45, conf = 0.85, maxlen = 6, maxtime = 0),
  appearance = list(default = "lhs", rhs = "MalwareDetection=1")
)
Apriori
Parameter specification:
Algorithmic control:
Absolute minimum support count: 2332
set item appearances ...[1 item(s)] done [0.00s].
set transactions ...[4936 item(s), 5184 transaction(s)] done [0.06s].
sorting and recoding items ... [46 item(s)] done [0.01s].
creating transaction tree ... done [0.00s].
checking subsets of size 1 2 3 4 5 6
Mining stopped (time limit reached). Only patterns up to a length of 6 returned!
done [17.55s].
writing ... [525 rule(s)] done [0.12s].
creating S4 object ... done [0.18s].
# Print the leading rules of the MalwareDetection=1 rule set with their
# quality measures.
inspect(head(rules))
lhs rhs support confidence coverage lift count
[1] {OSVersion=V5,
Subsystem=2,
ChargeCycles=0-213365555} => {MalwareDetection=1} 0.4565972 0.8760178 0.5212191 1.692611 2367
[2] {MinorImageVersion=0,
Subsystem=2,
ChargeCycles=0-213365555} => {MalwareDetection=1} 0.4594907 0.8878122 0.5175540 1.715400 2382
[3] {e_sp=184,
OSVersion=V5,
Subsystem=2,
ChargeCycles=0-213365555} => {MalwareDetection=1} 0.4502315 0.8744848 0.5148534 1.689649 2334
[4] {OSVersion=V5,
Subsystem=2,
ChargeCycles=0-213365555,
CarMileage=0-8125235} => {MalwareDetection=1} 0.4546682 0.8798059 0.5167824 1.699931 2357
[5] {e_cparhdr=4,
OSVersion=V5,
Subsystem=2,
ChargeCycles=0-213365555} => {MalwareDetection=1} 0.4535108 0.8752792 0.5181327 1.691184 2351
[6] {e_lfarlc=64,
OSVersion=V5,
Subsystem=2,
ChargeCycles=0-213365555} => {MalwareDetection=1} 0.4544753 0.8755110 0.5190972 1.691632 2356
# Flat data.frame copy of the MalwareDetection=1 rules.
rules_dataframe <- as(rules, "data.frame")
# Mine rules whose right-hand side is MalwareDetection=0.
# The captured log reports "Mining stopped (time limit reached)", so the rule
# set depended on machine speed. maxlen caps the pattern length explicitly and
# maxtime = 0 switches the time limit off, making the result reproducible.
# NOTE(review): without the time limit this call may run longer — confirm the
# runtime is acceptable or lower maxlen.
rules_no <- apriori(
  data = trans,
  parameter = list(supp = 0.40, conf = 0.6, maxlen = 6, maxtime = 0),
  appearance = list(default = "lhs", rhs = "MalwareDetection=0")
)
Apriori
Parameter specification:
Algorithmic control:
Absolute minimum support count: 2073
set item appearances ...[1 item(s)] done [0.00s].
set transactions ...[4936 item(s), 5184 transaction(s)] done [0.06s].
sorting and recoding items ... [46 item(s)] done [0.00s].
creating transaction tree ... done [0.00s].
checking subsets of size 1 2 3 4 5 6
Mining stopped (time limit reached). Only patterns up to a length of 6 returned!
done [20.09s].
writing ... [180 rule(s)] done [0.15s].
creating S4 object ... done [0.20s].
# Print the leading rules of the MalwareDetection=0 rule set with their
# quality measures.
inspect(head(rules_no))
lhs rhs support confidence coverage lift count
[1] {e_cp=3,
SizeOfUninitializedData=0,
SizeOfHeapReserve=0-3355443,
SizeOfHeapCommit=0-13107} => {MalwareDetection=0} 0.4558256 0.6031138 0.7557870 1.250117 2363
[2] {e_cp=3,
SizeOfUninitializedData=0,
SizeOfStackReserve=0-3355443,
SizeOfHeapReserve=0-3355443} => {MalwareDetection=0} 0.4533179 0.6019467 0.7530864 1.247698 2350
[3] {e_cp=3,
SizeOfUninitializedData=0,
SizeOfStackReserve=0-3355443,
SizeOfHeapCommit=0-13107} => {MalwareDetection=0} 0.4533179 0.6030280 0.7517361 1.249939 2350
[4] {e_cblp=144,
SizeOfUninitializedData=0,
SizeOfHeapReserve=0-3355443,
SizeOfHeapCommit=0-13107} => {MalwareDetection=0} 0.4558256 0.6028061 0.7561728 1.249479 2363
[5] {e_cblp=144,
SizeOfUninitializedData=0,
SizeOfStackReserve=0-3355443,
SizeOfHeapReserve=0-3355443} => {MalwareDetection=0} 0.4533179 0.6016385 0.7534722 1.247059 2350
[6] {e_cblp=144,
SizeOfUninitializedData=0,
SizeOfStackReserve=0-3355443,
SizeOfHeapCommit=0-13107} => {MalwareDetection=0} 0.4533179 0.6027186 0.7521219 1.249298 2350
# Flat data.frame copy of the MalwareDetection=0 rules.
rules_dataframe_no <- as(rules_no, 'data.frame')
Arules visualisation
# arulesViz supplies the plot() methods used below for rule sets.
library(arulesViz)
# Two-key plot of the MalwareDetection=1 rules.
plot(rules, method='two-key plot')
#plot(rules, method='two-key plot', engine='interactive')
# Parallel-coordinates plot of the same rule set.
plot(rules, method = "paracoord")
# Two-key plot of the MalwareDetection=0 rules.
plot(rules_no, method='two-key plot')
Normalising data for Neural Networks
#' Min-max rescale a numeric vector to the range [0, 1].
#'
#' @param x Numeric vector.
#' @return Numeric vector of the same length. Missing values stay missing and
#'   are ignored when finding the range; a constant vector maps to 0 instead
#'   of NaN (0 / 0).
normalize <- function(x) {
  rng <- range(x, na.rm = TRUE)
  span <- rng[2] - rng[1]
  if (span == 0) {
    # Constant column: there is no spread to scale by.
    flat <- numeric(length(x))
    flat[is.na(x)] <- NA
    return(flat)
  }
  (x - rng[1]) / span
}
# Build maxmindf: the non-numeric columns of clamp_model followed by the
# min-max scaled numeric columns.
nums <- vapply(clamp_model, is.numeric, logical(1))
clamp_num_nn <- clamp_model[, nums]
normalized <- as.data.frame(lapply(clamp_num_nn, normalize))
#names(mmnums)
# Same frame with the numeric columns removed.
clamp_fac_nn <- clamp_model[, !nums, drop = FALSE]
maxmindf <- data.frame(clamp_fac_nn, normalized)
str(maxmindf)
'data.frame': 5184 obs. of 52 variables:
$ e_cblp : Factor w/ 9 levels "0","10","46",..: 8 8 8 8 8 5 8 8 8 8 ...
$ e_cp : Factor w/ 7 levels "0","1","2","3",..: 4 4 4 4 4 3 4 4 4 4 ...
$ e_cparhdr : Factor w/ 3 levels "0","2","4": 3 3 3 3 3 3 3 3 3 3 ...
$ e_minalloc : Factor w/ 4 levels "0","15","16",..: 1 1 1 1 1 2 1 1 1 1 ...
$ e_maxalloc : Factor w/ 3 levels "0","17744","65535": 3 3 3 3 3 3 3 3 3 3 ...
$ e_ss : Factor w/ 2 levels "0","65520": 1 1 1 1 1 1 1 1 1 1 ...
$ e_sp : Factor w/ 8 levels "0","40","64",..: 4 4 4 4 4 4 4 4 4 4 ...
$ e_csum : Factor w/ 3 levels "0","1","2": 1 1 1 1 1 1 1 1 1 1 ...
$ e_ip : Factor w/ 4 levels "0","256","1047",..: 1 1 1 1 1 1 1 1 1 1 ...
$ e_cs : Factor w/ 4 levels "0","18293","18919",..: 1 1 1 1 1 1 1 1 1 1 ...
$ e_lfarlc : Factor w/ 3 levels "0","64","65": 2 2 2 2 2 2 2 2 2 2 ...
$ e_ovno : Factor w/ 2 levels "0","26": 1 1 1 1 1 2 1 1 1 1 ...
$ e_oemid : Factor w/ 2 levels "0","267": 1 1 1 1 1 1 1 1 1 1 ...
$ e_oeminfo : Factor w/ 3 levels "0","6","8": 1 1 1 1 1 1 1 1 1 1 ...
$ Models : Factor w/ 3 levels "Model X","Model Y",..: 1 1 1 1 1 1 1 1 1 1 ...
$ YearObtained : Factor w/ 36 levels "1970","1971",..: 16 9 22 21 22 4 22 22 22 20 ...
$ PointerToSymbolTable : Factor w/ 9 levels "0","36384","109088",..: 1 1 1 1 1 1 1 1 1 1 ...
$ Characteristics : Factor w/ 42 levels "34","35","258",..: 27 30 27 27 3 40 30 27 3 11 ...
$ Magic : Factor w/ 2 levels "0","267": 2 2 2 2 2 2 2 2 2 2 ...
$ MajorLinkerVersion : Factor w/ 23 levels "0","1","2","3",..: 9 6 10 10 11 3 7 9 11 10 ...
$ MinorLinkerVersion : Factor w/ 36 levels "0","1","2","3",..: 1 11 1 1 11 22 1 1 11 1 ...
$ SectionAlignment : Factor w/ 6 levels "0","128","256",..: 5 5 5 5 5 5 5 5 5 5 ...
$ FileAlignment : Factor w/ 7 levels "0","128","256",..: 4 4 4 4 4 4 7 7 4 4 ...
$ MajorOperatingSystemVersion: Factor w/ 10 levels "0","1","2","4",..: 4 4 5 6 6 2 4 4 6 5 ...
$ MinorOperatingSystemVersion: Factor w/ 10 levels "0","1","2","3",..: 1 1 1 2 3 1 1 1 3 1 ...
$ MajorImageVersion : Factor w/ 41 levels "0","1","2","3",..: 1 1 1 7 7 1 1 1 7 1 ...
$ MinorImageVersion : Factor w/ 53 levels "0","1","2","3",..: 1 1 1 2 3 1 1 1 3 1 ...
$ SoftwareVersion : Factor w/ 6 levels "0","1","3","4",..: 5 4 5 5 6 4 4 4 6 5 ...
$ OSVersion : Factor w/ 5 levels "V5","V4","V3",..: 2 1 1 1 3 1 1 1 3 1 ...
$ Subsystem : Factor w/ 6 levels "0","1","2","3",..: 4 3 3 3 3 3 3 3 4 4 ...
$ DllCharacteristics : Factor w/ 25 levels "0","1","2","3",..: 5 1 7 12 17 1 1 1 17 17 ...
$ LoaderFlags : Factor w/ 6 levels "0","4357151",..: 1 1 1 1 1 1 1 1 1 1 ...
$ MalwareDetection : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
$ Country : Factor w/ 39 levels "Australia","Austria",..: 5 3 32 3 25 5 32 5 39 25 ...
$ NumberOfSections : num 0.0909 0.0909 0.1212 0 0.1212 ...
$ NumberOfSymbols : num 0 0 0 0 0 0 0 0 0 0 ...
$ SizeOfOptionalHeader : num 0 0 0 0 0 0 0 0 0 0 ...
$ SizeOfCode : num 3.06e-04 1.14e-06 7.70e-06 0.00 3.28e-06 ...
$ SizeOfInitializedData : num 2.78e-03 3.15e-05 2.52e-04 1.08e-03 4.48e-04 ...
$ SizeOfUninitializedData : num 0 0 0 0 0 0 0 0 0 0 ...
$ AddressOfEntryPoint : num 0.025231 0.000181 0.000681 0 0.000316 ...
$ BaseOfCode : num 9.7e-05 9.7e-05 9.7e-05 9.7e-05 9.7e-05 ...
$ BaseOfData : num 2.62e-02 1.93e-04 7.73e-04 9.66e-05 3.86e-04 ...
$ ChargeCycles : num 0.55533 0.12581 0.12581 0.12581 0.00197 ...
$ CarMileage : num 0.016434 0.000252 0.000756 0.001159 0.000706 ...
$ SizeOfHeaders : num 0.000442 0.000442 0.000442 0.000221 0.000442 ...
$ CheckSum : num 2.78e-04 0.00 1.58e-05 2.65e-05 1.61e-05 ...
$ SizeOfStackReserve : num 0.03125 0.03125 0.03125 0.03125 0.00781 ...
$ SizeOfStackCommit : num 0.00195 0.00195 0.00195 0.00195 0.00391 ...
$ SizeOfHeapReserve : num 0.0312 0.0312 0.0312 0.0312 0.0312 ...
$ SizeOfHeapCommit : num 0.0312 0.0312 0.0312 0.0312 0.0312 ...
$ NumberOfRvaAndSizes : num 1 1 1 1 1 1 1 1 1 1 ...
# 70/30 split of the scaled data, driven by the MalwareDetection labels.
# mmtrain is a logical row mask; it is used directly instead of being compared
# with the reassignable shorthands T / F.
mmtrain <- sample.split(Y = maxmindf$MalwareDetection, SplitRatio = 0.7)
mmtrainset <- subset(maxmindf, mmtrain)
mmtestset <- subset(maxmindf, !mmtrain)
Training neural network model
# Fit the neural network on the scaled training split: size = 22 hidden units,
# at most 50 iterations, weight decay 1e-5, weight-count cap raised to 15000.
nn_model <- nnet(MalwareDetection ~ ., data=mmtrainset, size=22, maxit=50, decay=1.0e-5, MaxNWts=15000)
# weights: 8999
initial value 4195.517673
iter 10 value 565.283776
iter 20 value 262.743727
iter 30 value 114.592404
iter 40 value 70.022117
iter 50 value 60.301821
final value 60.301821
stopped after 50 iterations
# Predict class labels for the held-out rows and compare them with the truth.
nn_predicted <- predict(nn_model, newdata = mmtestset, type = "class")
# Give the predictions the reference's level set so the comparison still works
# when the model predicts only one of the two classes.
confusionMatrix(
  factor(nn_predicted, levels = levels(mmtestset$MalwareDetection)),
  mmtestset$MalwareDetection
)
Confusion Matrix and Statistics
Reference
Prediction 0 1
0 708 15
1 42 790
Accuracy : 0.9633
95% CI : (0.9528, 0.9721)
No Information Rate : 0.5177
P-Value [Acc > NIR] : < 2.2e-16
Kappa : 0.9265
Mcnemar's Test P-Value : 0.0005736
Sensitivity : 0.9440
Specificity : 0.9814
Pos Pred Value : 0.9793
Neg Pred Value : 0.9495
Prevalence : 0.4823
Detection Rate : 0.4553
Detection Prevalence : 0.4650
Balanced Accuracy : 0.9627
'Positive' Class : 0
Train test split
# 70/30 split of the unscaled modelling data, driven by the MalwareDetection
# labels. `train` is a logical row mask; it is used directly instead of being
# compared with the reassignable shorthands T / F.
train <- sample.split(Y = clamp_model$MalwareDetection, SplitRatio = 0.7)
trainset <- subset(clamp_model, train)
testset <- subset(clamp_model, !train)
Grid search algorithm and K-fold Cross Validation
# Single-row tuning grid.
# NOTE(review): these look like boosting-style tuning parameter names — confirm
# which model this grid is meant for.
grid_default <- expand.grid(n.trees = 200,
                            interaction.depth = 1,
                            shrinkage = 0.1,
                            n.minobsinnode = 10)
folds <- 10
# Training-row indices for each fold, built from the class labels.
cvIndex <- createFolds(factor(trainset$MalwareDetection), folds, returnTrain = TRUE) # stratified k fold
# Cross-validation settings reusing the folds above (the stray trailing comma
# that passed an empty argument has been removed).
train_control_log <- trainControl(
  index = cvIndex,
  number = folds,
  method = "cv"
)
Logistic Regression
# Logistic regression (glm with a binomial family) fitted through train() using
# the cross-validation settings in train_control_log.
logistic <- train(MalwareDetection~., data=trainset, trControl = train_control_log, method = "glm", family=binomial)
glm.fit: fitted probabilities numerically 0 or 1 occurredprediction from a rank-deficient fit may be misleadingglm.fit: fitted probabilities numerically 0 or 1 occurredprediction from a rank-deficient fit may be misleadingglm.fit: algorithm did not convergeglm.fit: fitted probabilities numerically 0 or 1 occurredprediction from a rank-deficient fit may be misleadingglm.fit: algorithm did not convergeglm.fit: fitted probabilities numerically 0 or 1 occurredprediction from a rank-deficient fit may be misleadingglm.fit: fitted probabilities numerically 0 or 1 occurredprediction from a rank-deficient fit may be misleadingglm.fit: algorithm did not convergeglm.fit: fitted probabilities numerically 0 or 1 occurredprediction from a rank-deficient fit may be misleadingglm.fit: fitted probabilities numerically 0 or 1 occurredprediction from a rank-deficient fit may be misleadingglm.fit: fitted probabilities numerically 0 or 1 occurredprediction from a rank-deficient fit may be misleadingglm.fit: fitted probabilities numerically 0 or 1 occurredprediction from a rank-deficient fit may be misleadingglm.fit: fitted probabilities numerically 0 or 1 occurredprediction from a rank-deficient fit may be misleadingglm.fit: algorithm did not convergeglm.fit: fitted probabilities numerically 0 or 1 occurred
# Per-class probabilities for the held-out rows (one column per class label).
logreg_probs <- predict(logistic, newdata = testset, type = 'prob')
prediction from a rank-deficient fit may be misleading
# Turn the class probabilities into 0/1 decisions at `threshold`.
threshold <- 0.5
# The cut-off now comes from `threshold` rather than a second hard-coded 0.5.
logreg_predicted <- data.table(ifelse(logreg_probs > threshold, 1, 0))
# Column `1` flags rows whose class-"1" probability exceeds the threshold.
# Give it the reference's level set so the comparison still works when only
# one class is predicted.
confusionMatrix(
  factor(logreg_predicted$`1`, levels = levels(testset$MalwareDetection)),
  testset$MalwareDetection
)
Confusion Matrix and Statistics
Reference
Prediction 0 1
0 720 55
1 30 750
Accuracy : 0.9453
95% CI : (0.9329, 0.9561)
No Information Rate : 0.5177
P-Value [Acc > NIR] : < 2.2e-16
Kappa : 0.8907
Mcnemar's Test P-Value : 0.009237
Sensitivity : 0.9600
Specificity : 0.9317
Pos Pred Value : 0.9290
Neg Pred Value : 0.9615
Prevalence : 0.4823
Detection Rate : 0.4630
Detection Prevalence : 0.4984
Balanced Accuracy : 0.9458
'Positive' Class : 0
Checking for multicollinearity
# logistic_check <- glm(MalwareDetection ~., data = trainset, family = binomial)
# car::vif(logistic_check)
# It returns an error: there are aliased coefficients in the model
# This means that we have run into perfect multicollinearity.
# The column involved is "NumberOfRvaAndSizes" which is removed in feature selection process.
Random Forest
# Random forest on the training split with the package defaults.
# The former `tuneGrid = grid_default` and `trControl = train_control`
# arguments were dropped: they are not randomForest() parameters, and
# `train_control` is not defined in this part of the script (only
# `train_control_log` is), so they suggested a grid search / cross-validation
# that never took place.
# NOTE(review): if tuning is wanted, fit through caret::train() instead.
randomForest_model <- randomForest(
  MalwareDetection ~ .,
  data = trainset
)
# Predict the held-out rows with the random forest and compare the predictions
# with the true labels.
rf_predicted <- predict(randomForest_model, newdata = testset)
confusionMatrix(rf_predicted, testset$MalwareDetection)
Confusion Matrix and Statistics
Reference
Prediction 0 1
0 729 5
1 21 800
Accuracy : 0.9833
95% CI : (0.9756, 0.989)
No Information Rate : 0.5177
P-Value [Acc > NIR] : < 2.2e-16
Kappa : 0.9665
Mcnemar's Test P-Value : 0.003264
Sensitivity : 0.9720
Specificity : 0.9938
Pos Pred Value : 0.9932
Neg Pred Value : 0.9744
Prevalence : 0.4823
Detection Rate : 0.4688
Detection Prevalence : 0.4720
Balanced Accuracy : 0.9829
'Positive' Class : 0
Applying Feature Selection using Boruta
# Boruta feature selection against MalwareDetection, capped at maxRuns = 11
# with progress tracing enabled (doTrace = 2).
# NOTE(review): this runs on all of clamp_model, which includes the rows held
# out in testset — confirm that selecting features on those rows is intended.
boruta <- Boruta(MalwareDetection ~ ., data = clamp_model, doTrace = 2, maxRuns=11)
1. run of importance source...
2. run of importance source...
3. run of importance source...
4. run of importance source...
5. run of importance source...
6. run of importance source...
7. run of importance source...
8. run of importance source...
9. run of importance source...
10. run of importance source...
#print(boruta)
# Importance plot with vertical axis labels in a smaller font.
plot(boruta, las = 2, cex.axis = 0.7)
#plotImpHistory(boruta)
# Resolve attributes still marked tentative, then show per-attribute stats.
bor <- TentativeRoughFix(boruta)
#print(bor)
attStats(bor)
#getSelectedAttributes(bor, withTentative = F)
selected_features <- getSelectedAttributes(bor, withTentative = FALSE)
# drop = FALSE keeps a data.frame even if only one attribute is selected, so
# the target column can always be attached on the next line.
clamp_selected <- clamp_model[, selected_features, drop = FALSE]
clamp_selected$MalwareDetection <- clamp_model$MalwareDetection
Normalising numerical data
#' Min-max rescale a numeric vector to the range [0, 1].
#'
#' @param x Numeric vector.
#' @return Numeric vector of the same length. Missing values stay missing and
#'   are ignored when finding the range; a constant vector maps to 0 instead
#'   of NaN (0 / 0).
normalize <- function(x) {
  rng <- range(x, na.rm = TRUE)
  span <- rng[2] - rng[1]
  if (span == 0) {
    # Constant column: there is no spread to scale by.
    flat <- numeric(length(x))
    flat[is.na(x)] <- NA
    return(flat)
  }
  (x - rng[1]) / span
}
# Build maxmindf_s: the non-numeric columns of clamp_selected followed by the
# min-max scaled numeric columns.
nums <- vapply(clamp_selected, is.numeric, logical(1))
clamp_num_nn_s <- clamp_selected[, nums]
normalized <- as.data.frame(lapply(clamp_num_nn_s, normalize))
#names(mmnums)
# Same frame with the numeric columns removed.
clamp_fac_nn_s <- clamp_selected[, !nums, drop = FALSE]
maxmindf_s <- data.frame(clamp_fac_nn_s, normalized)
str(maxmindf_s)
'data.frame': 5184 obs. of 50 variables:
$ e_cblp : Factor w/ 9 levels "0","10","46",..: 8 8 8 8 8 5 8 8 8 8 ...
$ e_cp : Factor w/ 7 levels "0","1","2","3",..: 4 4 4 4 4 3 4 4 4 4 ...
$ e_cparhdr : Factor w/ 3 levels "0","2","4": 3 3 3 3 3 3 3 3 3 3 ...
$ e_minalloc : Factor w/ 4 levels "0","15","16",..: 1 1 1 1 1 2 1 1 1 1 ...
$ e_maxalloc : Factor w/ 3 levels "0","17744","65535": 3 3 3 3 3 3 3 3 3 3 ...
$ e_ss : Factor w/ 2 levels "0","65520": 1 1 1 1 1 1 1 1 1 1 ...
$ e_sp : Factor w/ 8 levels "0","40","64",..: 4 4 4 4 4 4 4 4 4 4 ...
$ e_csum : Factor w/ 3 levels "0","1","2": 1 1 1 1 1 1 1 1 1 1 ...
$ e_ip : Factor w/ 4 levels "0","256","1047",..: 1 1 1 1 1 1 1 1 1 1 ...
$ e_cs : Factor w/ 4 levels "0","18293","18919",..: 1 1 1 1 1 1 1 1 1 1 ...
$ e_lfarlc : Factor w/ 3 levels "0","64","65": 2 2 2 2 2 2 2 2 2 2 ...
$ e_ovno : Factor w/ 2 levels "0","26": 1 1 1 1 1 2 1 1 1 1 ...
$ e_oemid : Factor w/ 2 levels "0","267": 1 1 1 1 1 1 1 1 1 1 ...
$ e_oeminfo : Factor w/ 3 levels "0","6","8": 1 1 1 1 1 1 1 1 1 1 ...
$ Models : Factor w/ 3 levels "Model X","Model Y",..: 1 1 1 1 1 1 1 1 1 1 ...
$ YearObtained : Factor w/ 36 levels "1970","1971",..: 16 9 22 21 22 4 22 22 22 20 ...
$ PointerToSymbolTable : Factor w/ 9 levels "0","36384","109088",..: 1 1 1 1 1 1 1 1 1 1 ...
$ Characteristics : Factor w/ 42 levels "34","35","258",..: 27 30 27 27 3 40 30 27 3 11 ...
$ Magic : Factor w/ 2 levels "0","267": 2 2 2 2 2 2 2 2 2 2 ...
$ MajorLinkerVersion : Factor w/ 23 levels "0","1","2","3",..: 9 6 10 10 11 3 7 9 11 10 ...
$ MinorLinkerVersion : Factor w/ 36 levels "0","1","2","3",..: 1 11 1 1 11 22 1 1 11 1 ...
$ SectionAlignment : Factor w/ 6 levels "0","128","256",..: 5 5 5 5 5 5 5 5 5 5 ...
$ FileAlignment : Factor w/ 7 levels "0","128","256",..: 4 4 4 4 4 4 7 7 4 4 ...
$ MajorOperatingSystemVersion: Factor w/ 10 levels "0","1","2","4",..: 4 4 5 6 6 2 4 4 6 5 ...
$ MinorOperatingSystemVersion: Factor w/ 10 levels "0","1","2","3",..: 1 1 1 2 3 1 1 1 3 1 ...
$ MajorImageVersion : Factor w/ 41 levels "0","1","2","3",..: 1 1 1 7 7 1 1 1 7 1 ...
$ MinorImageVersion : Factor w/ 53 levels "0","1","2","3",..: 1 1 1 2 3 1 1 1 3 1 ...
$ SoftwareVersion : Factor w/ 6 levels "0","1","3","4",..: 5 4 5 5 6 4 4 4 6 5 ...
$ OSVersion : Factor w/ 5 levels "V5","V4","V3",..: 2 1 1 1 3 1 1 1 3 1 ...
$ Subsystem : Factor w/ 6 levels "0","1","2","3",..: 4 3 3 3 3 3 3 3 4 4 ...
$ DllCharacteristics : Factor w/ 25 levels "0","1","2","3",..: 5 1 7 12 17 1 1 1 17 17 ...
$ LoaderFlags : Factor w/ 6 levels "0","4357151",..: 1 1 1 1 1 1 1 1 1 1 ...
$ Country : Factor w/ 39 levels "Australia","Austria",..: 5 3 32 3 25 5 32 5 39 25 ...
$ MalwareDetection : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
$ NumberOfSections : num 0.0909 0.0909 0.1212 0 0.1212 ...
$ NumberOfSymbols : num 0 0 0 0 0 0 0 0 0 0 ...
$ SizeOfOptionalHeader : num 0 0 0 0 0 0 0 0 0 0 ...
$ SizeOfCode : num 3.06e-04 1.14e-06 7.70e-06 0.00 3.28e-06 ...
$ SizeOfInitializedData : num 2.78e-03 3.15e-05 2.52e-04 1.08e-03 4.48e-04 ...
$ SizeOfUninitializedData : num 0 0 0 0 0 0 0 0 0 0 ...
$ AddressOfEntryPoint : num 0.025231 0.000181 0.000681 0 0.000316 ...
$ BaseOfCode : num 9.7e-05 9.7e-05 9.7e-05 9.7e-05 9.7e-05 ...
$ BaseOfData : num 2.62e-02 1.93e-04 7.73e-04 9.66e-05 3.86e-04 ...
$ ChargeCycles : num 0.55533 0.12581 0.12581 0.12581 0.00197 ...
$ CarMileage : num 0.016434 0.000252 0.000756 0.001159 0.000706 ...
$ CheckSum : num 2.78e-04 0.00 1.58e-05 2.65e-05 1.61e-05 ...
$ SizeOfStackReserve : num 0.03125 0.03125 0.03125 0.03125 0.00781 ...
$ SizeOfStackCommit : num 0.00195 0.00195 0.00195 0.00195 0.00391 ...
$ SizeOfHeapReserve : num 0.0312 0.0312 0.0312 0.0312 0.0312 ...
$ SizeOfHeapCommit : num 0.0312 0.0312 0.0312 0.0312 0.0312 ...
#maxmindf <- maxmindf %>% group_by(HasDetections) %>% sample_frac(.7)
#maxmindf <- one_hot(as.data.table(maxmindf))
# Split the scaled, feature-selected data into training and test partitions,
# using the class label as the Y argument of sample.split().
mmtrain_s <- sample.split(Y = maxmindf_s$MalwareDetection, SplitRatio = 0.7)
# Filter on the mask computed just above (mmtrain_s). The previous code used
# `mmtrain`, a mask left over from an earlier split, so this split was ignored.
mmtrainset_s <- subset(maxmindf_s, mmtrain_s)
mmtestset_s <- subset(maxmindf_s, !mmtrain_s)
Training neural network
# Start the wall-clock timer for the neural-network run on the selected features.
start.time <- Sys.time()
# Fit an nnet model with 22 hidden units, at most 50 iterations and a weight
# decay of 1e-5. MaxNWts is raised so nnet accepts the weight count of this model.
nn_model_s <- nnet(
  MalwareDetection ~ .,
  data = mmtrainset_s,
  size = 22,
  maxit = 50,
  decay = 1.0e-5,
  MaxNWts = 15000
)
# weights: 8955
initial value 2295.547194
iter 10 value 568.649266
iter 20 value 238.634120
iter 30 value 109.412072
iter 40 value 71.103822
iter 50 value 60.547338
final value 60.547338
stopped after 50 iterations
# Predict class labels for the held-out partition with the fitted network.
nn_predicted_s <- predict(nn_model_s, newdata = mmtestset_s, type = "class")
# Stop the timer; the elapsed time covers both training and prediction.
end.time <- Sys.time()
time.taken <- difftime(end.time, start.time)
# Compare the predictions with the true labels.
confusionMatrix(
  data = as.factor(nn_predicted_s),
  reference = mmtestset_s$MalwareDetection
)
Confusion Matrix and Statistics
Reference
Prediction 0 1
0 718 17
1 32 788
Accuracy : 0.9685
95% CI : (0.9586, 0.9766)
No Information Rate : 0.5177
P-Value [Acc > NIR] : <2e-16
Kappa : 0.9369
Mcnemar's Test P-Value : 0.0455
Sensitivity : 0.9573
Specificity : 0.9789
Pos Pred Value : 0.9769
Neg Pred Value : 0.9610
Prevalence : 0.4823
Detection Rate : 0.4617
Detection Prevalence : 0.4727
Balanced Accuracy : 0.9681
'Positive' Class : 0
Neural Network Graph
# Sweep the hidden-layer size and record the test-set accuracy of each model.
# NOTE(review): the sizes are compared on the test partition (mmtestset), so
# the chosen size is tuned on test data; consider a separate validation split.
hiddenNodes <- 1:30
# Preallocate the result vector instead of growing it inside the loop.
accuracy <- numeric(length(hiddenNodes))
# Iterate over hiddenNodes itself so the loop cannot drift from the x-axis data.
for (i in seq_along(hiddenNodes)) {
  nn_model <- nnet(
    MalwareDetection ~ .,
    data = mmtrainset,
    size = hiddenNodes[i],
    maxit = 50,
    decay = 1.0e-5,
    MaxNWts = 15000
  )
  nn_predicted <- predict(nn_model, newdata = mmtestset, type = "class")
  # Use the reference's level set so that a model predicting only one class
  # still yields a factor with the same levels as the true labels.
  cm <- confusionMatrix(
    factor(nn_predicted, levels = levels(mmtestset$MalwareDetection)),
    mmtestset$MalwareDetection
  )
  overall <- cm$overall['Accuracy']
  accuracy[i] <- overall[["Accuracy"]]
}
# weights: 410
initial value 2671.982872
iter 10 value 2126.076353
iter 20 value 1407.884143
iter 30 value 1196.138169
iter 40 value 1119.350089
iter 50 value 993.896612
final value 993.896612
stopped after 50 iterations
# weights: 819
initial value 2673.989067
iter 10 value 922.851035
iter 20 value 382.267766
iter 30 value 247.099615
iter 40 value 195.009754
iter 50 value 137.782835
final value 137.782835
stopped after 50 iterations
# weights: 1228
initial value 3021.335418
iter 10 value 593.152167
iter 20 value 346.071644
iter 30 value 273.037841
iter 40 value 231.001431
iter 50 value 209.236724
final value 209.236724
stopped after 50 iterations
# weights: 1637
initial value 2801.006363
iter 10 value 574.665580
iter 20 value 417.066881
iter 30 value 294.523259
iter 40 value 191.754139
iter 50 value 136.358484
final value 136.358484
stopped after 50 iterations
# weights: 2046
initial value 2545.263804
iter 10 value 741.687474
iter 20 value 526.899604
iter 30 value 481.641657
iter 40 value 461.502286
iter 50 value 431.749635
final value 431.749635
stopped after 50 iterations
# weights: 2455
initial value 2990.123625
iter 10 value 593.089519
iter 20 value 332.988040
iter 30 value 203.478787
iter 40 value 135.171911
iter 50 value 109.187541
final value 109.187541
stopped after 50 iterations
# weights: 2864
initial value 2592.242402
iter 10 value 571.854797
iter 20 value 237.321420
iter 30 value 123.476115
iter 40 value 72.360441
iter 50 value 60.412669
final value 60.412669
stopped after 50 iterations
# weights: 3273
initial value 2491.936931
iter 10 value 524.553265
iter 20 value 209.770882
iter 30 value 103.028849
iter 40 value 77.189834
iter 50 value 65.384928
final value 65.384928
stopped after 50 iterations
# weights: 3682
initial value 2698.676744
iter 10 value 574.151509
iter 20 value 291.560191
iter 30 value 190.517970
iter 40 value 133.648717
iter 50 value 108.605919
final value 108.605919
stopped after 50 iterations
# weights: 4091
initial value 3029.962598
iter 10 value 478.633536
iter 20 value 225.451755
iter 30 value 116.176413
iter 40 value 73.932974
iter 50 value 59.604135
final value 59.604135
stopped after 50 iterations
# weights: 4500
initial value 2500.565414
iter 10 value 494.758127
iter 20 value 270.433034
iter 30 value 157.178160
iter 40 value 86.650943
iter 50 value 63.221677
final value 63.221677
stopped after 50 iterations
# weights: 4909
initial value 2861.710939
iter 10 value 797.744459
iter 20 value 274.210555
iter 30 value 153.317117
iter 40 value 103.429514
iter 50 value 84.882964
final value 84.882964
stopped after 50 iterations
# weights: 5318
initial value 2378.178728
iter 10 value 513.421647
iter 20 value 234.947893
iter 30 value 115.498690
iter 40 value 76.266920
iter 50 value 59.833226
final value 59.833226
stopped after 50 iterations
# weights: 5727
initial value 2756.368652
iter 10 value 558.748167
iter 20 value 261.857410
iter 30 value 120.287767
iter 40 value 74.088022
iter 50 value 60.589526
final value 60.589526
stopped after 50 iterations
# weights: 6136
initial value 3780.981689
iter 10 value 704.636538
iter 20 value 280.500632
iter 30 value 150.561985
iter 40 value 84.642487
iter 50 value 68.387274
final value 68.387274
stopped after 50 iterations
# weights: 6545
initial value 2667.008545
iter 10 value 513.026124
iter 20 value 244.168848
iter 30 value 112.740442
iter 40 value 72.711167
iter 50 value 60.120899
final value 60.120899
stopped after 50 iterations
# weights: 6954
initial value 3050.592090
iter 10 value 543.629601
iter 20 value 280.074033
iter 30 value 136.988539
iter 40 value 84.753929
iter 50 value 63.002270
final value 63.002270
stopped after 50 iterations
# weights: 7363
initial value 3990.891308
iter 10 value 503.448214
iter 20 value 277.621518
iter 30 value 117.199154
iter 40 value 75.447335
iter 50 value 58.992919
final value 58.992919
stopped after 50 iterations
# weights: 7772
initial value 3112.566160
iter 10 value 614.027262
iter 20 value 328.173624
iter 30 value 147.676139
iter 40 value 83.185941
iter 50 value 64.023913
final value 64.023913
stopped after 50 iterations
# weights: 8181
initial value 2498.444171
iter 10 value 418.763493
iter 20 value 215.217376
iter 30 value 102.464652
iter 40 value 67.253166
iter 50 value 58.403429
final value 58.403429
stopped after 50 iterations
# weights: 8590
initial value 2892.817994
iter 10 value 430.922555
iter 20 value 222.182939
iter 30 value 114.451184
iter 40 value 70.256498
iter 50 value 60.898330
final value 60.898330
stopped after 50 iterations
# weights: 8999
initial value 3269.031378
iter 10 value 667.842892
iter 20 value 260.200717
iter 30 value 124.559795
iter 40 value 75.685170
iter 50 value 59.975700
final value 59.975700
stopped after 50 iterations
# weights: 9408
initial value 2834.287033
iter 10 value 428.919046
iter 20 value 227.116371
iter 30 value 102.462261
iter 40 value 72.267606
iter 50 value 59.816149
final value 59.816149
stopped after 50 iterations
# weights: 9817
initial value 2567.013898
iter 10 value 490.325549
iter 20 value 231.201564
iter 30 value 113.707183
iter 40 value 70.695450
iter 50 value 59.513310
final value 59.513310
stopped after 50 iterations
# weights: 10226
initial value 4020.994758
iter 10 value 580.138791
iter 20 value 306.579662
iter 30 value 196.483623
iter 40 value 138.645752
iter 50 value 87.361282
final value 87.361282
stopped after 50 iterations
# weights: 10635
initial value 2826.197027
iter 10 value 439.062768
iter 20 value 191.969730
iter 30 value 97.324507
iter 40 value 65.193102
iter 50 value 58.017921
final value 58.017921
stopped after 50 iterations
# weights: 11044
initial value 2965.033025
iter 10 value 521.295521
iter 20 value 228.361348
iter 30 value 95.235391
iter 40 value 65.470190
iter 50 value 56.666725
final value 56.666725
stopped after 50 iterations
# weights: 11453
initial value 2349.076175
iter 10 value 475.786309
iter 20 value 195.729425
iter 30 value 105.273608
iter 40 value 70.041168
iter 50 value 60.105414
final value 60.105414
stopped after 50 iterations
# weights: 11862
initial value 3606.736557
iter 10 value 470.569751
iter 20 value 258.697064
iter 30 value 126.105322
iter 40 value 75.554895
iter 50 value 61.646409
final value 61.646409
stopped after 50 iterations
# weights: 12271
initial value 2670.285039
iter 10 value 507.348814
iter 20 value 210.723451
iter 30 value 112.198396
iter 40 value 68.670631
iter 50 value 58.686489
final value 58.686489
stopped after 50 iterations
# Show the accuracy recorded for each hidden-layer size.
print(accuracy)
[1] 0.9106109 0.9633441 0.9588424 0.9581994 0.9556270 0.9691318 0.9646302 0.9614148 0.9575563 0.9646302
[11] 0.9672026 0.9646302 0.9665595 0.9639871 0.9614148 0.9652733 0.9665595 0.9646302 0.9659164 0.9633441
[21] 0.9678457 0.9710611 0.9646302 0.9627010 0.9684887 0.9652733 0.9627010 0.9639871 0.9594855 0.9633441
# Static plot of accuracy against hidden-layer size: points first, then a
# connecting line drawn over them.
plot(
  x = hiddenNodes,
  y = accuracy,
  xlab = "Number of Hidden Nodes",
  ylab = "Model Accuracy"
)
lines(x = hiddenNodes, y = accuracy)
# Interactive (plotly) version of the accuracy-vs-hidden-nodes curve.
data <- data.frame(hiddenNodes, accuracy)
names(data) <- c("Number of Hidden Nodes", "Model Accuracy")
# Font shared by both axis titles.
f <- list(
  family = "Courier New, monospace",
  size = 18,
  color = "#7f7f7f"
)
x <- list(
  title = "Number of Hidden Nodes",
  titlefont = f
)
y <- list(
  title = "Model Accuracy",
  titlefont = f
)
# Reference the renamed columns of `data` (backticks are needed because of the
# spaces). The previous formulas, ~hiddenNodes and ~accuracy, named columns that
# no longer exist in `data` and only worked by falling back to global variables.
fig <- plot_ly(
  data,
  x = ~`Number of Hidden Nodes`,
  y = ~`Model Accuracy`,
  type = 'scatter',
  mode = 'lines'
)
fig <- fig %>% layout(xaxis = x, yaxis = y)
fig
`arrange_()` is deprecated as of dplyr 0.7.0.
Please use `arrange()` instead.
See vignette('programming') for more help
This warning is displayed once every 8 hours.
Call `lifecycle::last_warnings()` to see where this warning was generated.
Train Test Split
# Split the feature-selected data into training and test partitions,
# using the class label as the Y argument of sample.split().
train_s <- sample.split(Y = clamp_selected$MalwareDetection, SplitRatio = 0.7)
# Filter on the mask computed just above (train_s). The previous code used
# `train`, which is either a mask left over from an earlier split or caret's
# train() function, so this split was never applied.
trainset_s <- subset(clamp_selected, train_s)
testset_s <- subset(clamp_selected, !train_s)
Logistic Regression
# Start the wall-clock timer for the logistic-regression run.
start.time <- Sys.time()
# Fit a binomial GLM through caret, resampled as configured in train_control_log.
logistic_selected <- train(
  MalwareDetection ~ .,
  data = trainset_s,
  method = "glm",
  family = binomial,
  trControl = train_control_log
)
glm.fit: fitted probabilities numerically 0 or 1 occurredprediction from a rank-deficient fit may be misleadingglm.fit: algorithm did not convergeglm.fit: fitted probabilities numerically 0 or 1 occurredprediction from a rank-deficient fit may be misleadingglm.fit: algorithm did not convergeglm.fit: fitted probabilities numerically 0 or 1 occurredprediction from a rank-deficient fit may be misleadingglm.fit: algorithm did not convergeglm.fit: fitted probabilities numerically 0 or 1 occurredprediction from a rank-deficient fit may be misleadingglm.fit: algorithm did not convergeglm.fit: fitted probabilities numerically 0 or 1 occurredprediction from a rank-deficient fit may be misleadingglm.fit: algorithm did not convergeglm.fit: fitted probabilities numerically 0 or 1 occurredprediction from a rank-deficient fit may be misleadingglm.fit: algorithm did not convergeglm.fit: fitted probabilities numerically 0 or 1 occurredprediction from a rank-deficient fit may be misleadingglm.fit: algorithm did not convergeglm.fit: fitted probabilities numerically 0 or 1 occurredprediction from a rank-deficient fit may be misleadingglm.fit: fitted probabilities numerically 0 or 1 occurredprediction from a rank-deficient fit may be misleadingglm.fit: fitted probabilities numerically 0 or 1 occurredprediction from a rank-deficient fit may be misleadingglm.fit: fitted probabilities numerically 0 or 1 occurred
# Class probabilities for the held-out partition.
logreg_probs_s <- predict(logistic_selected, testset_s, type = "prob")
prediction from a rank-deficient fit may be misleading
# Probability cutoff above which an entry is labelled 1.
threshold <- 0.5
# Apply the cutoff through `threshold` (it was previously defined but unused,
# with 0.5 hard-coded here). The comparison is applied to the whole
# probability table, so the result keeps one indicator column per class.
logreg_predicted_s <- data.table(ifelse(logreg_probs_s > threshold, 1, 0))
# Stop the timer; the elapsed time covers training, prediction and thresholding.
end.time <- Sys.time()
time.taken <- end.time - start.time
# Score the indicator column named "1" against the true labels.
confusionMatrix(as.factor(logreg_predicted_s$`1`), testset_s$MalwareDetection)
Confusion Matrix and Statistics
Reference
Prediction 0 1
0 701 38
1 49 767
Accuracy : 0.9441
95% CI : (0.9314, 0.9549)
No Information Rate : 0.5177
P-Value [Acc > NIR] : <2e-16
Kappa : 0.8879
Mcnemar's Test P-Value : 0.2837
Sensitivity : 0.9347
Specificity : 0.9528
Pos Pred Value : 0.9486
Neg Pred Value : 0.9400
Prevalence : 0.4823
Detection Rate : 0.4508
Detection Prevalence : 0.4752
Balanced Accuracy : 0.9437
'Positive' Class : 0
# Start the wall-clock timer for the random-forest run.
start.time <- Sys.time()
# Fit a random forest with the package defaults. The former `tuneGrid` and
# `trControl` arguments belong to caret::train(); randomForest() does not list
# them among its parameters and only absorbs them via `...`, so they had no
# effect on the fit. They are removed so the call no longer suggests that
# tuning or resampling takes place.
randomForest_model_s <- randomForest(
  MalwareDetection ~ .,
  data = trainset_s
)
# Predict class labels for the held-out partition.
rf_predicted_s <- predict(randomForest_model_s, newdata = testset_s)
# Stop the timer; the elapsed time covers both training and prediction.
end.time <- Sys.time()
time.taken <- end.time - start.time
confusionMatrix(rf_predicted_s, testset_s$MalwareDetection)
Confusion Matrix and Statistics
Reference
Prediction 0 1
0 728 3
1 22 802
Accuracy : 0.9839
95% CI : (0.9764, 0.9896)
No Information Rate : 0.5177
P-Value [Acc > NIR] : < 2.2e-16
Kappa : 0.9678
Mcnemar's Test P-Value : 0.0003182
Sensitivity : 0.9707
Specificity : 0.9963
Pos Pred Value : 0.9959
Neg Pred Value : 0.9733
Prevalence : 0.4823
Detection Rate : 0.4682
Detection Prevalence : 0.4701
Balanced Accuracy : 0.9835
'Positive' Class : 0
# Recompute the random-forest confusion matrix and export its overall
# statistics as a one-row CSV file.
csvMatrix <- confusionMatrix(
  data = rf_predicted_s,
  reference = testset_s$MalwareDetection
)
# Transpose the named statistics vector into a single row, one column per metric.
tocsv <- data.frame(t(csvMatrix$overall))
write.csv(tocsv, file = "csvMatrix.csv")